From 6aecd3da132c62e9b38c1596f62865b8178a73d6 Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Mon, 23 Dec 2024 12:00:07 +0800 Subject: [PATCH] Update llpc from commit 9046d0be lgc: Refactor wave size related code lgc: Set insert point in visitStridedIndexAdd [NFC] Rename GS calcFactor to hwConfig [Continuations] Refactor around maximum passed Fix PsExtraLdsSize was not being merged in the new metadata Do some renaming for tessellation terms Add transform shader passes [Continuations] Add check for strided buffer load to improve performance [Continuations] Remove 32-bit GPURT define Add more dialect ops to support NGG lowering [NFC] Rename some passes VK_EXT_shader_replicated_composites - Frontend Shader Compiler Implementation Expanding precision underflow peephole to pow Fix assert crash in dEQP-VK.ycbcr.query.lod.fragment.* Vulkan Shader/Fine MALL Tuning Use zero-sized array for variable length array Hash vertex shader for transform shader [DEBUG] Add InstructionSlot class RT: Ensure payload and hit attribute are scalar-aligned Add option forceMemoryBarrierScope Move the adjustment of Ij to lowerInOut Fix the lit test failures llvmraytracing: Remove unused MaxContStateBytes lgc: zero-sized array follow-up Update PeepholeOptPhiWithIdenticalLoad.lgc Restore ostream format for pipeline dump RT: Fix another likely issue with multithreaded compiles Add support for MbPassManager Make SpirvLowerRayTracing re-usable Add lit tests for subgroup shuffle codegen quality Use readlane for shuffle with constant index llvmraytracing: Add debug print to PipelineState vfx: Fix printf format strings More use of IRBuilder Hit the unused 'gfxIp' local variables [Continuations] Fix up lit tests with jump arguments Hook up AmdExtD3DShaderIntrinsics_Halt Remove the unnecessary internal built-in GsWaveId Only align elf when it is actually an ELF Fix the parser of gfxip Remove m_out Remove uses of getPointerTo Update transGroupArithOp to use Optional ClusterSize [Continuations] Pass 'shader 
index' as argument Add an extra simplify pass to Gpurt lib passes Set correct syncScope in transScope lgc: Make task payload buffer desc globally coherent lgc: Use BuilderImpl::buildBufferCompactDesc() in LowerBufferOperations Enable LCG test after LLVM update [Continuations] Add 'RayTracingShaderStage' helper Optimize HS output writes for zero/negative outer TFs [VKD3D] Add rtIgnoreDeclaredPayloadSize for Proton RT games [Continuations] Propagate used payload count to intersection shaders Remove 16-bit transform feedback support [Continuations] Remove 'AwaitTraversal' return address WA. lgc: Refine handling of GpurtFloatWithRoundModeOp Update the LLPC_INTERFACE_MINOR_VERSION to 9 Use iterator for insertion position RT: start to use a ModuleBunch representation Test updates for entry_point change Disable fast math for Position only in pre-fragment stage. [Continuations] Cleanup system data / hit attribute arguments [Continuations] Cleanups in LowerRaytracingPipeline llvmraytracing: Some Builder-related cleanups Test updates for entry_point change extra Update tests for new LLVM upstream [Continuations] Update 'lower-traversal.ll' test. 
[Continuations] Early exit from replaceAllPointerUses and remove unused argument Fix amber tests lgccps: Run InstCombine before CoroSplit Unify the InternalDescriptorSetId with D3D lgc: Create StridedBufferDescToPtrOp for strided normal descriptor lgc: Relax occupancy target for memory bound functions lgc: Add globallyCoherent flag to StridedBufferDescToPtrOp [compilerutils] Remove getWithSamePointeeType [Continuations] Continuation State Builder -- Basic [Continuations] Use generically typed opaque.use in cont-state-builder lit-test --- cmake/CompilerFlags.cmake | 25 + cmake/findllvm.cmake | 6 +- cmake/llvm.cmake | 6 + compilerutils/CMakeLists.txt | 25 + .../include/compilerutils/CompilerUtils.h | 9 +- .../include/compilerutils/ModuleBunch.h | 4 +- compilerutils/lib/CompilerUtils.cpp | 37 +- compilerutils/plugin/CMakeLists.txt | 25 + compilerutils/test/CMakeLists.txt | 25 + .../test/dxil-to-llvm/simple-i1-vec.dxil | 129 -- .../test/value-origin-tracking/basic-tests.ll | 6 +- docs/DdnRelocatableShaderElf.md | 4 +- include/vkgcDefs.h | 47 +- lgc/CMakeLists.txt | 6 +- lgc/builder/ArithBuilder.cpp | 82 + lgc/builder/BuilderRecorder.cpp | 15 + lgc/builder/BuilderRecorder.h | 1 + lgc/builder/BuilderReplayer.cpp | 4 + lgc/builder/DescBuilder.cpp | 37 +- lgc/builder/ImageBuilder.cpp | 1 - lgc/builder/InOutBuilder.cpp | 61 +- lgc/builder/MatrixBuilder.cpp | 4 +- lgc/builder/MiscBuilder.cpp | 21 +- lgc/builder/SubgroupBuilder.cpp | 21 +- lgc/builder/YCbCrAddressHandler.cpp | 4 +- lgc/elfLinker/ColorExportShader.cpp | 4 +- lgc/elfLinker/ColorExportShader.h | 2 +- lgc/elfLinker/NullFragmentShader.cpp | 2 +- lgc/include/lgc/builder/BuilderImpl.h | 10 +- lgc/include/lgc/patch/FragmentColorExport.h | 16 +- ...upMemory.h => InitializeWorkgroupMemory.h} | 6 +- lgc/include/lgc/patch/LowerBufferOperations.h | 11 +- .../lgc/patch/LowerCooperativeMatrix.h | 11 +- lgc/include/lgc/patch/LowerInOut.h | 19 +- lgc/include/lgc/patch/PassthroughHullShader.h | 12 +- 
lgc/include/lgc/patch/PeepholeOptimization.h | 12 +- lgc/include/lgc/patch/PreparePipelineAbi.h | 7 +- lgc/include/lgc/patch/SetupTargetFeatures.h | 4 +- lgc/include/lgc/patch/ShaderInputs.h | 2 +- lgc/include/lgc/state/Abi.h | 2 +- lgc/include/lgc/state/Defs.h | 9 - lgc/include/lgc/state/IntrinsDefs.h | 28 +- lgc/include/lgc/state/PipelineState.h | 14 +- lgc/include/lgc/state/ResourceUsage.h | 59 +- lgc/interface/lgc/Builder.h | 13 +- lgc/interface/lgc/BuilderCommon.h | 5 +- lgc/interface/lgc/BuiltInDefs.h | 1 - .../lgc/util => interface/lgc}/Debug.h | 36 + lgc/interface/lgc/LgcContext.h | 3 +- lgc/interface/lgc/LgcDialect.td | 165 +- lgc/interface/lgc/PassManager.h | 1 + lgc/interface/lgc/Pipeline.h | 8 +- lgc/patch/CollectResourceUsage.cpp | 131 +- lgc/patch/Continufy.cpp | 2 +- lgc/patch/FragmentColorExport.cpp | 80 +- lgc/patch/GenerateCopyShader.cpp | 27 +- lgc/patch/GenerateNullFragmentShader.cpp | 2 +- ...mory.cpp => InitializeWorkgroupMemory.cpp} | 14 +- lgc/patch/LgcLowering.cpp | 18 +- lgc/patch/LowerBufferOperations.cpp | 240 ++- lgc/patch/LowerCooperativeMatrix.cpp | 174 ++- lgc/patch/LowerGpuRt.cpp | 23 +- lgc/patch/LowerInOut.cpp | 1083 ++++++------- lgc/patch/LowerInvariantLoads.cpp | 26 +- lgc/patch/MeshTaskShader.cpp | 8 +- lgc/patch/MutateEntryPoint.cpp | 34 +- lgc/patch/NggPrimShader.cpp | 1218 ++++++++------- lgc/patch/NggPrimShader.h | 8 +- lgc/patch/PassRegistry.inc | 10 +- lgc/patch/PassthroughHullShader.cpp | 24 +- lgc/patch/PeepholeOptimization.cpp | 70 +- lgc/patch/PreparePipelineAbi.cpp | 290 +++- lgc/patch/RegisterMetadataBuilder.cpp | 65 +- lgc/patch/SetupTargetFeatures.cpp | 8 +- lgc/patch/ShaderInputs.cpp | 9 +- lgc/patch/ShaderMerger.cpp | 177 ++- lgc/patch/ShaderMerger.h | 3 +- lgc/patch/StructurizeBuffers.cpp | 2 +- lgc/patch/SystemValues.cpp | 10 +- lgc/patch/VertexFetch.cpp | 12 +- lgc/state/LgcContext.cpp | 8 +- lgc/state/PassManagerCache.cpp | 2 +- lgc/state/PipelineState.cpp | 129 +- lgc/state/ResourceUsage.cpp | 4 +- 
lgc/test/BuiltIns/cs-numworkgroups.lgc | 2 +- lgc/test/BuiltIns/cs-workgroupid.lgc | 2 +- lgc/test/CsComputeLibrary.lgc | 2 +- lgc/test/CsLowerDebugPrintf.lgc | 6 +- lgc/test/ImageSampleNoReturn.lgc | 2 +- lgc/test/PeepholeOptPhiWithIdenticalLoad.lgc | 21 +- lgc/test/ShaderStages.lgc | 36 +- lgc/test/TaskShaderOps.lgc | 6 +- lgc/test/TaskShaderRegConfig.lgc | 2 +- lgc/test/Transforms/Continufy/simple.lgc | 83 - .../CpsLowering/continuation-basic.lgc | 125 +- .../CpsLowering/cps-entry-point.lgc | 6 +- .../CpsLowering/cps-from-continufy.lgc | 420 ----- .../CpsLowering/cps-stack-lowering.lgc | 627 ++++---- .../CpsLowering/cps-unify-exits.lgc | 262 ++-- .../InvariantStartUserWithPhiNode.lgc | 10 +- .../buffer-index-op.lgc | 0 .../buffer.atomic.ops.lgc | 60 +- .../simple.lgc | 122 +- .../strided-buffer-ops.lgc | 82 +- .../uniform-phi.lgc | 8 +- .../LowerCooperativeMatrix/gfx1010muladd.lgc | 12 +- .../LowerCooperativeMatrix/gfx1011muladd.lgc | 12 +- .../{bf16muladd.lgc => gfx1100muladd.lgc} | 15 +- .../LowerCooperativeMatrix/load-wave64.lgc | 15 +- .../LowerCooperativeMatrix/store-wave64.lgc | 12 +- .../Transforms/LowerDebugPrintf/basic.lgc | 2 +- .../PeepholeOptLog2PowUnderflow.lgc | 132 ++ .../PeepholeOpt/PeepholeOptLog2Underflow.lgc | 69 - lgc/test/UberFetchShader.lgc | 2 +- lgc/test/WorkgroupIdOpt.lgc | 6 +- lgc/test/lgcdis.lgc | 2 +- lgc/util/Debug.cpp | 41 +- lgc/util/GfxRegHandler.cpp | 3 + lgc/util/PassManager.cpp | 25 +- lgc/util/StartStopTimer.cpp | 2 +- llpc/CMakeLists.txt | 6 + llpc/context/llpcCompiler.cpp | 554 +++---- llpc/context/llpcCompiler.h | 8 +- llpc/context/llpcComputeContext.cpp | 20 +- llpc/context/llpcComputeContext.h | 6 +- llpc/context/llpcGraphicsContext.cpp | 87 +- llpc/context/llpcGraphicsContext.h | 3 - llpc/context/llpcPipelineContext.cpp | 81 + llpc/context/llpcPipelineContext.h | 4 + llpc/context/llpcRayTracingContext.cpp | 7 +- llpc/lowering/LinkTransformShaders.cpp | 139 ++ llpc/lowering/LinkTransformShaders.h | 49 + 
llpc/lowering/LowerAccessChain.cpp | 2 +- llpc/lowering/LowerGlCompatibility.cpp | 259 +--- llpc/lowering/LowerGlCompatibility.h | 4 +- llpc/lowering/LowerGlobals.cpp | 7 +- .../LowerInternalLibraryIntrinsic.cpp | 5 +- llpc/lowering/LowerMath.cpp | 21 +- llpc/lowering/LowerMath.h | 1 + llpc/lowering/LowerRayTracing.cpp | 427 +++-- llpc/lowering/LowerRayTracing.h | 122 +- llpc/lowering/Lowering.cpp | 12 +- llpc/lowering/Lowering.h | 14 +- llpc/lowering/LoweringUtil.cpp | 28 +- llpc/lowering/LoweringUtil.h | 21 +- .../lowering/PrepareTransformVertexShader.cpp | 183 +++ llpc/lowering/PrepareTransformVertexShader.h | 63 + llpc/lowering/ProcessGpuRtLibrary.cpp | 30 +- .../bugs/ArrayOfVariablePointers.spvasm | 2 +- ...onUniform_TestTexutreLoadStoreInt64.spvasm | 2 +- .../core/OpAtomicXXX_TestImage_lit.elf | Bin 10616 -> 0 bytes .../OpAtomicXXX_TestSharedVariable_lit.comp | 28 +- ...stStorageBlockAndSharedWithData64_lit.comp | 72 +- .../core/OpFNegate_TestDvec3_lit.frag | 2 +- .../OpImageRead_TestInt64ImageLoad.spvasm | 2 +- ...SparseRead_TestInt64SparseImageLoad.spvasm | 2 +- .../shaderdb/core/TestXfbStateMetadata.vert | 2 +- ...PipelineGsTess_TestVsTesGsMergeShader.pipe | 515 ++++--- .../PipelineGs_TestVsGSMergeShader.pipe | 20 +- .../ExtShaderFloat16_TestInterpFuncs_lit.frag | 8 +- ...ShaderInt8_TestSharedVarLoadStore_lit.comp | 4 +- .../ExtXfb_TessGsDoubleOutput_lit.geom | 4 +- .../ExtXfb_TestGsFloatOutput_lit.geom | 4 +- .../ExtXfb_TestTesDoubleOutput_lit.tese | 4 +- .../ExtXfb_TestTesFloatOutput_lit.tese | 4 +- .../ExtXfb_TestVsDoubleOutput_lit.vert | 4 +- .../ExtXfb_TestVsFloatOutput_lit.vert | 4 +- ...OpExtInst_TestInterpolateAtOffset_lit.frag | 10 +- ...OpExtInst_TestInterpolateAtSample_lit.frag | 9 +- ...pExtInst_TestInterpolateDynIdx1DArray.frag | 7 +- ..._TestInterpolateDynIdx1DArrayInStruct.frag | 8 +- ...st_TestInterpolateDynIdx1DStructArray.frag | 4 - ..._TestInterpolateDynIdx2DArrayInStruct.frag | 4 - ...terpolateDynIdx2DArrayInStructInArray.frag | 3 +- 
...st_TestInterpolateDynIdx2DStructArray.frag | 2 - ...pExtInst_TestInterpolateDynIdx3DArray.frag | 4 - ...OpExtInst_TestInterpolateDynIdxVector.frag | 17 +- .../OpExtInst_TestUnpackHalf2x16_lit.frag | 2 +- .../CallInstAsUserOfGlobalVariable.spvasm | 4 +- llpc/test/shaderdb/general/ImgDescLoad.comp | 4 +- .../PipelineCs_ForceMemoryBarrierScope.pipe | 23 + .../PipelineCs_LdsSpillLimitDwordsOption.pipe | 2 +- .../PipelineCs_MultipleRootInlineBuffer.pipe | 2 +- ...ineTcsTes_TestLocMapLoadBuiltInOutput.pipe | 9 +- .../PipelineTess_TestInOutPacking.pipe | 112 +- .../PipelineVsFs_DynamicSampleInfo.pipe | 142 +- .../general/PipelineVsFs_TestNullFs.pipe | 2 +- .../general/PipelineVsFs_TestUberShader.pipe | 2 +- .../general/SubgroupShuffleIndexConstant.comp | 25 + .../SubgroupShuffleIndexDivergent.comp | 31 + .../general/SubgroupShuffleIndexUniform.comp | 31 + .../shaderdb/general/TestPatchBufferOp.comp | 2 +- .../shaderdb/general/TestWorkgroupIdOpt.comp | 5 +- .../shaderdb/general/UndefVertexOutput.spvasm | 4 +- .../PipelineVsFs_TestVsOutMiscSideBusEna.pipe | 2 +- .../shaderdb/gfx11/SgprUserDataInit_Cs.pipe | 2 +- .../shaderdb/gfx11/SgprUserDataInit_Fs.pipe | 4 +- .../gfx11/TessFactorStoreWithOpt.pipe | 80 +- .../shaderdb/gfx11/TestGsXfbWithHole.pipe | 10 +- .../cooperativeMatrix/array-of-matrices.comp | 6 +- .../cooperativeMatrix/extract-insert.spvasm | 10 +- .../cooperativeMatrix/loadstore-uvec4.comp | 4 +- ...ut_TestIndexingInterpOfInputArray_lit.frag | 305 +++- .../ObjStorageBlock_TestRowMajor_lit.frag | 6 +- .../ObjStorageBlock_TestRuntimeArray_lit.vert | 43 +- .../shaderdb/object/ObjXfb_TestBasic_lit.vert | 2 +- .../ray_tracing/PipelineRayquery.pipe | 6 +- .../PipelineRays_Continuations.pipe | 12 + ...inuations_IntersectionShaderVgprCount.pipe | 336 ++++ ...Continuations_SpecializeDriverShaders.pipe | 4 +- .../ray_tracing/PipelineRays_Irreducible.pipe | 2 +- ...eRays_TestRtIgnoreDeclaredPayloadSize.pipe | 280 ++++ .../shaderdb/ray_tracing/TestContState.rgen | 39 + 
.../ray_tracing/TestPayloadSizes.rgen | 48 + .../PipelineGs_BasicRelocGsTest.pipe | 6 +- .../PipelineVsFs_EnableColorExport.pipe | 8 +- .../relocatable_shaders/VsGs_Reloc.spvasm | 4 +- llpc/tool/amdllpc.cpp | 2 +- llpc/translator/lib/SPIRV/SPIRVReader.cpp | 497 +++--- llpc/translator/lib/SPIRV/SPIRVReader.h | 6 +- .../lib/SPIRV/libSPIRV/SPIRVInstruction.cpp | 12 +- .../lib/SPIRV/libSPIRV/SPIRVInstruction.h | 40 + .../lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h | 11 + .../lib/SPIRV/libSPIRV/SPIRVModule.cpp | 8 + .../lib/SPIRV/libSPIRV/SPIRVModule.h | 2 + .../lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h | 5 + .../lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h | 3 + .../lib/SPIRV/libSPIRV/SPIRVType.cpp | 6 +- .../translator/lib/SPIRV/libSPIRV/SPIRVType.h | 7 +- .../lib/SPIRV/libSPIRV/SPIRVValue.h | 10 + llpc/unittests/context/testOptLevel.cpp | 2 +- llpc/util/llpcElfWriter.cpp | 1 + llpc/util/llpcError.cpp | 7 +- llpc/util/llpcError.h | 9 +- llpc/util/llpcThreading.cpp | 255 +++ llpc/util/llpcThreading.h | 75 +- llvmraytracing/CMakeLists.txt | 26 + llvmraytracing/README.md | 21 +- llvmraytracing/include/lgc/LgcCpsDialect.td | 4 +- llvmraytracing/include/lgc/LgcIlCpsDialect.td | 24 +- .../include/llvmraytracing/Continuations.h | 7 +- .../llvmraytracing/ContinuationsDialect.h | 25 + .../llvmraytracing/ContinuationsUtil.h | 13 +- .../include/llvmraytracing/CpsStackLowering.h | 4 +- .../include/llvmraytracing/PipelineState.h | 6 + .../llvmraytracing/SpecializeDriverShaders.h | 2 + llvmraytracing/lib/CleanupContinuations.cpp | 69 +- llvmraytracing/lib/ContStateBuilder.cpp | 1369 +++++++++++++++++ llvmraytracing/lib/ContStateBuilder.h | 45 + llvmraytracing/lib/Continuations.cpp | 60 +- llvmraytracing/lib/ContinuationsLint.cpp | 43 +- .../lib/ContinuationsStatsReport.cpp | 3 +- llvmraytracing/lib/CpsStackLowering.cpp | 58 +- .../lib/DXILContLgcRtOpConverter.cpp | 4 +- llvmraytracing/lib/DXILContPostProcess.cpp | 294 +--- .../lib/DXILContPrepareGpurtLibrary.cpp | 18 +- 
llvmraytracing/lib/LgcCpsJumpInliner.cpp | 2 +- llvmraytracing/lib/LowerAwait.cpp | 40 +- .../lib/LowerRaytracingPipeline.cpp | 550 +++---- llvmraytracing/lib/PipelineState.cpp | 12 + .../lib/SpecializeDriverShaders.cpp | 146 +- llvmraytracing/plugin/CMakeLists.txt | 25 + llvmraytracing/test/CMakeLists.txt | 25 + .../test/dx/cleanup-continuations-malloc.ll | 74 - .../test/dx/cleanup-continuations.ll | 294 ---- llvmraytracing/test/dx/closest-hit.ll | 200 --- .../test/dx/continuation-registercount.ll | 305 ---- .../test/dx/continuation-stacksize.ll | 189 --- llvmraytracing/test/dx/continuation-state.ll | 121 -- ...nt-convert-lgc-rt-op-trace-payload-type.ll | 276 ---- .../dx/dxil-cont-convert-lgc-rt-op-trace.ll | 165 -- .../test/dx/dxil-cont-convert-lgc-rt-op.ll | 139 -- .../test/dx/dxil-cont-post-process.ll | 37 - ...t-prepare-gpurt-library-remove-waitmask.ll | 90 -- .../dx/dxil-cont-prepare-gpurt-library.ll | 90 -- ...-raygen-cont-state-in-persistent-launch.ll | 282 ---- .../test/dx/inline-const-jump-target.ll | 158 -- .../test/dx/intersection-registercount.ll | 184 --- llvmraytracing/test/dx/intrinsics/complete.ll | 76 - .../cont-payload-registers-i32-count.ll | 64 - .../test/dx/intrinsics/cont-stack-access.ll | 98 -- .../test/dx/intrinsics/cont-stack-alloc.ll | 81 - .../continuation-stack-is-global-false.ll | 38 - .../continuation-stack-is-global-true.ll | 38 - .../dx/intrinsics/get-current-func-addr.ll | 52 - llvmraytracing/test/dx/intrinsics/get-rtip.ll | 28 - .../test/dx/intrinsics/get-setting.ll | 25 - .../test/dx/intrinsics/get-shader-kind.ll | 76 - .../test/dx/intrinsics/get-shader-rec-idx.ll | 95 -- llvmraytracing/test/dx/intrinsics/is-llpc.ll | 26 - .../test/dx/intrinsics/shader-index.ll | 89 -- .../test/dx/intrinsics/value-i32.ll | 46 - .../multiple-setlocalrootindex-pre-coro.ll | 26 - .../dx/lint/multiple-setlocalrootindex.ll | 24 - .../test/dx/lint/undef-jump-target.ll | 22 - llvmraytracing/test/dx/lower-await.ll | 257 ---- 
.../test/dx/lower-rt-pipeline-exit-raygen.ll | 95 -- .../lower-rt-pipeline-small-payload-field.ll | 259 ---- .../test/dx/paq-hit-attribute-size.ll | 992 ------------ .../test/dx/payload-caller-in-paq.ll | 263 ---- .../test/dx/payload-caller-in-paq.ll.hlsl | 22 - .../test/dx/payload-save-registers.ll | 1285 ---------------- .../test/dx/payload-save-registers.ll.hlsl | 34 - llvmraytracing/test/dx/remat-indirect-load.ll | 46 - .../test/dx/remove-types-metadata.ll | 545 ------- .../test/dx/remove-unused-declarations.ll | 183 --- .../dx/specialize-driver-shaders/analysis.ll | 469 ------ .../lower-rt-pipeline-args.ll | 468 ------ .../specialization.ll | 104 -- llvmraytracing/test/dx/stats-report-sizes.ll | 58 - .../test/dx/unnamed-type-intrinsics.ll | 494 ------ llvmraytracing/test/dx/wrong-system-data.ll | 228 --- .../test/intrinsics/discard-values.ll | 2 - .../intrinsics/get-func-addr-not-found.ll | 4 +- .../test/intrinsics/get-func-addr.ll | 8 +- .../test/intrinsics/shader-start.ll | 14 +- .../lgccps/CpsLowering/continuation-basic.ll | 4 +- .../lgccps/CpsLowering/cps-entry-point.ll | 6 +- .../lgccps/CpsLowering/cps-from-continufy.ll | 18 +- .../cps-stack-lowering-dxil-global.ll | 244 --- .../cps-stack-lowering-dxil-scratch.ll | 247 --- .../lgccps/CpsLowering/cps-stack-lowering.ll | 16 +- llvmraytracing/test/lgccps/alloca-select.ll | 79 +- llvmraytracing/test/lgccps/await-if-else.ll | 66 +- llvmraytracing/test/lgccps/await-if.ll | 41 +- llvmraytracing/test/lgccps/await-in-loop.ll | 49 +- .../test/lgccps/call-shader-i1-payload.ll | 98 +- .../test/lgccps/cleanup-store-loads.ll | 33 +- llvmraytracing/test/lgccps/cps-no-await.ll | 4 +- .../test/lgccps/entry-point-with-cps.ll | 61 +- .../cont-payload-registers-get-i32.ll | 6 +- .../cont-payload-registers-i32-count.ll | 6 +- .../cont-payload-registers-set-i32.ll | 6 +- llvmraytracing/test/lgccps/lower-traversal.ll | 622 -------- llvmraytracing/test/lgccps/multiple-await.ll | 58 +- .../test/lgccps/simple-await-more-state.ll | 
43 +- llvmraytracing/test/lgccps/simple-await.ll | 55 +- .../lgccps/traversal-padding-hitattr-size.ll | 124 ++ llvmraytracing/unittests/CMakeLists.txt | 25 + script/spv-to-shaderdb-test.py | 26 + test/amber/a16.amber | 3 +- test/query_gfxip.py | 167 +- test/run_amber_test.py | 22 +- tool/dumper/vkgcPipelineDumper.cpp | 46 +- tool/update_llpc_test_checks.py | 26 + tool/vfx/vfxParser.cpp | 36 + tool/vfx/vfxSection.cpp | 10 + tool/vfx/vfxSection.h | 2 + tool/vfx/vfxVkSection.cpp | 25 + tool/vfx/vfxVkSection.h | 30 +- util/extensions.txt | 1 + util/vkgcCapability.h | 1 + util/vkgcExtension.cpp | 1 + util/vkgcExtension.h | 1 + version/CMakeLists.txt | 1 + version/include/llpc/GpurtIntrinsics.h | 6 +- version/include/llpcVersion.h.in | 4 +- 357 files changed, 10388 insertions(+), 16869 deletions(-) delete mode 100644 compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil rename lgc/include/lgc/patch/{PatchInitializeWorkgroupMemory.h => InitializeWorkgroupMemory.h} (90%) rename lgc/{include/lgc/util => interface/lgc}/Debug.h (64%) rename lgc/patch/{PatchInitializeWorkgroupMemory.cpp => InitializeWorkgroupMemory.cpp} (95%) delete mode 100644 lgc/test/Transforms/Continufy/simple.lgc delete mode 100644 lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc rename lgc/test/Transforms/{PatchBufferOp => LowerBufferOperations}/InvariantStartUserWithPhiNode.lgc (91%) rename lgc/test/Transforms/{PatchBufferOp => LowerBufferOperations}/buffer-index-op.lgc (100%) rename lgc/test/Transforms/{PatchBufferOp => LowerBufferOperations}/buffer.atomic.ops.lgc (97%) rename lgc/test/Transforms/{PatchBufferOp => LowerBufferOperations}/simple.lgc (67%) rename lgc/test/Transforms/{PatchBufferOp => LowerBufferOperations}/strided-buffer-ops.lgc (97%) rename lgc/test/Transforms/{PatchBufferOp => LowerBufferOperations}/uniform-phi.lgc (91%) rename lgc/test/Transforms/LowerCooperativeMatrix/{bf16muladd.lgc => gfx1100muladd.lgc} (69%) create mode 100644 
lgc/test/Transforms/PeepholeOpt/PeepholeOptLog2PowUnderflow.lgc delete mode 100644 lgc/test/Transforms/PeepholeOpt/PeepholeOptLog2Underflow.lgc create mode 100644 llpc/lowering/LinkTransformShaders.cpp create mode 100644 llpc/lowering/LinkTransformShaders.h create mode 100644 llpc/lowering/PrepareTransformVertexShader.cpp create mode 100644 llpc/lowering/PrepareTransformVertexShader.h delete mode 100644 llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.elf create mode 100644 llpc/test/shaderdb/general/PipelineCs_ForceMemoryBarrierScope.pipe create mode 100644 llpc/test/shaderdb/general/SubgroupShuffleIndexConstant.comp create mode 100644 llpc/test/shaderdb/general/SubgroupShuffleIndexDivergent.comp create mode 100644 llpc/test/shaderdb/general/SubgroupShuffleIndexUniform.comp create mode 100644 llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_IntersectionShaderVgprCount.pipe create mode 100644 llpc/test/shaderdb/ray_tracing/PipelineRays_TestRtIgnoreDeclaredPayloadSize.pipe create mode 100644 llpc/test/shaderdb/ray_tracing/TestContState.rgen create mode 100644 llpc/test/shaderdb/ray_tracing/TestPayloadSizes.rgen create mode 100644 llpc/util/llpcThreading.cpp create mode 100644 llvmraytracing/lib/ContStateBuilder.cpp create mode 100644 llvmraytracing/lib/ContStateBuilder.h delete mode 100644 llvmraytracing/test/dx/cleanup-continuations-malloc.ll delete mode 100644 llvmraytracing/test/dx/cleanup-continuations.ll delete mode 100644 llvmraytracing/test/dx/closest-hit.ll delete mode 100644 llvmraytracing/test/dx/continuation-registercount.ll delete mode 100644 llvmraytracing/test/dx/continuation-stacksize.ll delete mode 100644 llvmraytracing/test/dx/continuation-state.ll delete mode 100644 llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace-payload-type.ll delete mode 100644 llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll delete mode 100644 llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op.ll delete mode 100644 
llvmraytracing/test/dx/dxil-cont-post-process.ll delete mode 100644 llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library-remove-waitmask.ll delete mode 100644 llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library.ll delete mode 100644 llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll delete mode 100644 llvmraytracing/test/dx/inline-const-jump-target.ll delete mode 100644 llvmraytracing/test/dx/intersection-registercount.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/complete.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/cont-stack-access.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-false.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-true.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/get-rtip.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/get-setting.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/get-shader-kind.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/get-shader-rec-idx.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/is-llpc.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/shader-index.ll delete mode 100644 llvmraytracing/test/dx/intrinsics/value-i32.ll delete mode 100644 llvmraytracing/test/dx/lint/multiple-setlocalrootindex-pre-coro.ll delete mode 100644 llvmraytracing/test/dx/lint/multiple-setlocalrootindex.ll delete mode 100644 llvmraytracing/test/dx/lint/undef-jump-target.ll delete mode 100644 llvmraytracing/test/dx/lower-await.ll delete mode 100644 llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll delete mode 100644 llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll delete mode 100644 
llvmraytracing/test/dx/paq-hit-attribute-size.ll delete mode 100644 llvmraytracing/test/dx/payload-caller-in-paq.ll delete mode 100644 llvmraytracing/test/dx/payload-caller-in-paq.ll.hlsl delete mode 100644 llvmraytracing/test/dx/payload-save-registers.ll delete mode 100644 llvmraytracing/test/dx/payload-save-registers.ll.hlsl delete mode 100644 llvmraytracing/test/dx/remat-indirect-load.ll delete mode 100644 llvmraytracing/test/dx/remove-types-metadata.ll delete mode 100644 llvmraytracing/test/dx/remove-unused-declarations.ll delete mode 100644 llvmraytracing/test/dx/specialize-driver-shaders/analysis.ll delete mode 100644 llvmraytracing/test/dx/specialize-driver-shaders/lower-rt-pipeline-args.ll delete mode 100644 llvmraytracing/test/dx/specialize-driver-shaders/specialization.ll delete mode 100644 llvmraytracing/test/dx/stats-report-sizes.ll delete mode 100644 llvmraytracing/test/dx/unnamed-type-intrinsics.ll delete mode 100644 llvmraytracing/test/dx/wrong-system-data.ll delete mode 100644 llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-global.ll delete mode 100644 llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-scratch.ll delete mode 100644 llvmraytracing/test/lgccps/lower-traversal.ll create mode 100644 llvmraytracing/test/lgccps/traversal-padding-hitattr-size.ll diff --git a/cmake/CompilerFlags.cmake b/cmake/CompilerFlags.cmake index d28bc4ca11..258bd173fd 100644 --- a/cmake/CompilerFlags.cmake +++ b/cmake/CompilerFlags.cmake @@ -1,3 +1,28 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + function(set_compiler_options PROJECT_NAME ENABLE_WERROR) target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_20) set_target_properties(${PROJECT_NAME} PROPERTIES diff --git a/cmake/findllvm.cmake b/cmake/findllvm.cmake index fbf38f0d07..bb930a0905 100644 --- a/cmake/findllvm.cmake +++ b/cmake/findllvm.cmake @@ -27,11 +27,11 @@ if (NOT LLPC_LLVM_SRC_PATH) # Find LLVM source. Allow client driver to override using its own name for overlay builds. 
set(DEFAULT_LLPC_LLVM_SRC_PATH ${XGL_LLVM_SRC_PATH}) if (NOT DEFAULT_LLPC_LLVM_SRC_PATH) - if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/../../../imported/llvm-project/llvm) - set(DEFAULT_LLPC_LLVM_SRC_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../imported/llvm-project/llvm) + if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/../imported/llvm-project/llvm) + set(DEFAULT_LLPC_LLVM_SRC_PATH ${CMAKE_CURRENT_LIST_DIR}/../imported/llvm-project/llvm) elseif(EXISTS ${CMAKE_CURRENT_LIST_DIR}/../../llvm-project/llvm) set(DEFAULT_LLPC_LLVM_SRC_PATH ${CMAKE_CURRENT_LIST_DIR}/../../llvm-project/llvm) endif() endif() - set(LLPC_LLVM_SRC_PATH ${DEFAULT_LLPC_LLVM_SRC_PATH} CACHE PATH "Specify the path to LLVM.") + set(LLPC_LLVM_SRC_PATH ${DEFAULT_LLPC_LLVM_SRC_PATH} CACHE PATH "Specify the path to LLVM." FORCE) endif() diff --git a/cmake/llvm.cmake b/cmake/llvm.cmake index ee26b47a75..640a7fbe71 100644 --- a/cmake/llvm.cmake +++ b/cmake/llvm.cmake @@ -56,6 +56,12 @@ if(CMAKE_BUILD_TYPE_DEBUG) # See: llvm-project/llvm/cmake/modules/CrossCompile.cmake set(CROSS_TOOLCHAIN_FLAGS_NATIVE "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}" CACHE STRING "Toolchain flags for native build" FORCE) + + # Fail early to avoid the dreaded -ologo error. + if(CMAKE_VERSION VERSION_LESS "3.27") + message(FATAL_ERROR "Using LLVM_OPTIMIZED_TABLEGEN in a Debug build requires CMake 3.27 or higher." + " The current CMake version is ${CMAKE_VERSION}.") + endif() endif() #endif endif() diff --git a/compilerutils/CMakeLists.txt b/compilerutils/CMakeLists.txt index 8ea17ab716..f1830d0154 100644 --- a/compilerutils/CMakeLists.txt +++ b/compilerutils/CMakeLists.txt @@ -1,3 +1,28 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + cmake_minimum_required(VERSION 3.13.4) project(CompilerUtils LANGUAGES CXX) diff --git a/compilerutils/include/compilerutils/CompilerUtils.h b/compilerutils/include/compilerutils/CompilerUtils.h index c66576bcc2..3e3d762527 100644 --- a/compilerutils/include/compilerutils/CompilerUtils.h +++ b/compilerutils/include/compilerutils/CompilerUtils.h @@ -156,7 +156,7 @@ class CrossModuleInliner { // Writes instructions which are redundant after the replacement into // the given ToBeRemoved vector. // The caller has to handle the erasure afterwards. 
-void replaceAllPointerUses(llvm::IRBuilder<> *builder, llvm::Value *oldPointerValue, llvm::Value *newPointerValue, +void replaceAllPointerUses(llvm::Value *oldPointerValue, llvm::Value *newPointerValue, llvm::SmallVectorImpl &toBeRemoved); // Create a GEP if idx is non-null, otherwise return the pointer. @@ -169,13 +169,6 @@ llvm::Value *simplifyingCreateConstInBoundsGEP1_32(llvm::IRBuilder<> &builder, l namespace llvm { -// Replacement for PointerType::getWithSamePointeeType that works with new LLVM. -// Returns a typed pointer type if the pointer type is typed. -// -// TODO: Remove this as soon as all internal users of opaque pointers have been -// fixed. -PointerType *getWithSamePointeeType(PointerType *ptrTy, unsigned addressSpace); - /// Free-standing helpers. // Helper to visit all calls of a function. diff --git a/compilerutils/include/compilerutils/ModuleBunch.h b/compilerutils/include/compilerutils/ModuleBunch.h index 2bfe8cabcd..3f1a7a0f22 100644 --- a/compilerutils/include/compilerutils/ModuleBunch.h +++ b/compilerutils/include/compilerutils/ModuleBunch.h @@ -144,7 +144,7 @@ class ModuleBunchToModulePassAdaptor : public PassInfoMixin()> PassMaker, + explicit ModuleBunchToModulePassAdaptor(std::function()> PassMaker, bool EagerlyInvalidate = false) : PassMaker(PassMaker), EagerlyInvalidate(EagerlyInvalidate) {} @@ -160,7 +160,7 @@ class ModuleBunchToModulePassAdaptor : public PassInfoMixin Pass; - function_ref()> PassMaker; + std::function()> PassMaker; bool EagerlyInvalidate; }; diff --git a/compilerutils/lib/CompilerUtils.cpp b/compilerutils/lib/CompilerUtils.cpp index c301c95a13..80ebc4be12 100644 --- a/compilerutils/lib/CompilerUtils.cpp +++ b/compilerutils/lib/CompilerUtils.cpp @@ -276,7 +276,7 @@ Value *CrossModuleInliner::CrossModuleValueMaterializer::materialize(Value *v) { auto InsertToMappedTypes = [&mappedTypes](Type *sourceType, Type *copiedType) { assert((sourceType != nullptr) && (copiedType != nullptr)); if (sourceType != copiedType) { - 
auto found = mappedTypes.insert(std::make_pair(sourceType, copiedType)); + [[maybe_unused]] auto found = mappedTypes.insert(std::make_pair(sourceType, copiedType)); assert((found.second || copiedType == found.first->second) && "Inconsistent type mapping"); } }; @@ -527,10 +527,6 @@ std::string CrossModuleInliner::getCrossModuleName(GlobalValue &gv) { return (Twine(gv.getName()) + ".cloned." + gv.getParent()->getName()).str(); } -PointerType *llvm::getWithSamePointeeType(PointerType *ptrTy, unsigned addressSpace) { - return PointerType::get(ptrTy->getContext(), addressSpace); -} - void CrossModuleInliner::checkTargetModule(llvm::Module &targetModule) { if (impl->targetMod == nullptr) impl->targetMod = &targetModule; @@ -538,7 +534,7 @@ void CrossModuleInliner::checkTargetModule(llvm::Module &targetModule) { assert(impl->targetMod == &targetModule); } -void CompilerUtils::replaceAllPointerUses(IRBuilder<> *builder, Value *oldPointerValue, Value *newPointerValue, +void CompilerUtils::replaceAllPointerUses(Value *oldPointerValue, Value *newPointerValue, SmallVectorImpl &toBeRemoved) { // Note: The implementation explicitly supports typed pointers, which // complicates some of the code below. @@ -548,23 +544,20 @@ void CompilerUtils::replaceAllPointerUses(IRBuilder<> *builder, Value *oldPointe (void)oldPtrTy; PointerType *newPtrTy = cast(newPointerValue->getType()); unsigned newAS = newPtrTy->getAddressSpace(); - assert(newAS != oldPtrTy->getAddressSpace()); - assert(getWithSamePointeeType(oldPtrTy, newAS) == newPtrTy); + + // If a change of address space is not necessary then simply replace uses. + if (newAS == oldPtrTy->getAddressSpace()) { + oldPointerValue->replaceAllUsesWith(newPointerValue); + return; + } + + // Propagate a change of address space by traversing through the users and setup the addrspace. 
oldPointerValue->mutateType(newPtrTy); - // Traverse through the users and setup the addrspace SmallVector worklist(make_pointer_range(oldPointerValue->uses())); oldPointerValue->replaceAllUsesWith(newPointerValue); - // Given a pointer type, get a pointer with the same pointee type (possibly - // opaque) as the given type that uses the newAS address space. - auto getMutatedPtrTy = [newAS](Type *ty) { - PointerType *ptrTy = cast(ty); - // Support typed pointers: - return getWithSamePointeeType(ptrTy, newAS); - }; - #ifndef NDEBUG DenseSet PhiElems; #endif @@ -620,7 +613,7 @@ void CompilerUtils::replaceAllPointerUses(IRBuilder<> *builder, Value *oldPointe // This can happen with typed pointers assert(cast(inst)->getSrcTy()->isPointerTy() && cast(inst)->getDestTy()->isPointerTy()); - inst->mutateType(getMutatedPtrTy(inst->getType())); + inst->mutateType(newPtrTy); break; } case Instruction::AddrSpaceCast: @@ -628,7 +621,7 @@ void CompilerUtils::replaceAllPointerUses(IRBuilder<> *builder, Value *oldPointe assert(inst->getOperand(0)->getType()->getPointerAddressSpace() == newAS); // Push the correct users before RAUW. worklist.append(usesRange.begin(), usesRange.end()); - inst->mutateType(getMutatedPtrTy(inst->getType())); + inst->mutateType(newPtrTy); // Since we are mutating the address spaces of users as well, // we can just use the (already mutated) cast operand. 
inst->replaceAllUsesWith(inst->getOperand(0)); @@ -636,13 +629,13 @@ void CompilerUtils::replaceAllPointerUses(IRBuilder<> *builder, Value *oldPointe continue; case Instruction::IntToPtr: case Instruction::GetElementPtr: { - inst->mutateType(getMutatedPtrTy(inst->getType())); + inst->mutateType(newPtrTy); break; } case Instruction::Select: { auto *oldType = inst->getType(); if (oldType->isPointerTy()) { - Type *newType = getMutatedPtrTy(oldType); + Type *newType = newPtrTy; // No further processing if the type has the correct pointer type if (newType == oldType) continue; @@ -666,7 +659,7 @@ void CompilerUtils::replaceAllPointerUses(IRBuilder<> *builder, Value *oldPointe } #endif - Type *newType = getMutatedPtrTy(oldType); + Type *newType = newPtrTy; // No further processing if the type has the correct pointer type if (newType == oldType) continue; diff --git a/compilerutils/plugin/CMakeLists.txt b/compilerutils/plugin/CMakeLists.txt index b6dad1be9a..2ec5a1291c 100644 --- a/compilerutils/plugin/CMakeLists.txt +++ b/compilerutils/plugin/CMakeLists.txt @@ -1,3 +1,28 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. 
+ # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + set(LLVM_COMPILERUTILSPLUGIN_LINK_INTO_TOOLS ON CACHE BOOL "Link plugin into tools" FORCE) add_llvm_pass_plugin(CompilerUtilsPlugin diff --git a/compilerutils/test/CMakeLists.txt b/compilerutils/test/CMakeLists.txt index 8211af2112..3033c8910b 100644 --- a/compilerutils/test/CMakeLists.txt +++ b/compilerutils/test/CMakeLists.txt @@ -1,3 +1,28 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + set(COMPILERUTILS_TEST_DEPENDS cross-module-inline FileCheck count not opt) add_custom_target(compilerutils-test-depends DEPENDS ${COMPILERUTILS_TEST_DEPENDS}) set_target_properties(compilerutils-test-depends PROPERTIES FOLDER "Tests") diff --git a/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil b/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil deleted file mode 100644 index 1bf2fa9f61..0000000000 --- a/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil +++ /dev/null @@ -1,129 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -passes="dxil-to-llvm,lint" --verify-each --lint-abort-on-error -S %s | FileCheck %s -target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" -target triple = "dxil-ms-dx" - -declare void @use32(i32) -declare i32 @def32() -declare void @use1(i1) -declare i1 @def1() - -%simple.struct = type { <2 x i1> } -; Check that <2 x i1> is replaced by <2 x i32> in the struct: -; CHECK-NOT: %simple.struct = type { <2 x i1> } -; CHECK: %simple.struct.0 = type { <2 x i32> } -; CHECK-NOT: %simple.struct = type { <2 x i1> } - -define void @test_vec_alloca() { -; CHECK-LABEL: define {{[^@]+}}@test_vec_alloca() { -; CHECK-NEXT: [[VEC_ALLOCA:%.*]] = alloca <2 x i32>, align 8 -; CHECK-NEXT: [[I1_1:%.*]] = call i1 @def1() -; CHECK-NEXT: [[I1_2:%.*]] = call i1 @def1() -; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[I1_1]] to i32 -; CHECK-NEXT: [[VEC1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = 
zext i1 [[I1_2]] to i32 -; CHECK-NEXT: [[VEC_12:%.*]] = insertelement <2 x i32> [[VEC1]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: store <2 x i32> [[VEC_12]], ptr [[VEC_ALLOCA]], align 8 -; CHECK-NEXT: [[VEC_I32_LOAD:%.*]] = load i32, ptr [[VEC_ALLOCA]], align 4 -; CHECK-NEXT: call void @use32(i32 [[VEC_I32_LOAD]]) -; CHECK-NEXT: ret void -; - %vec.alloca = alloca <2 x i1>, align 1 - %i1.1 = call i1 @def1() - %i1.2 = call i1 @def1() - %vec = insertelement <2 x i1> undef, i1 %i1.1, i32 0 - %vec.1 = insertelement <2 x i1> %vec, i1 %i1.2, i32 1 - store <2 x i1> %vec.1, ptr %vec.alloca - %vec.i32.load = load i32, ptr %vec.alloca, align 4 - call void @use32(i32 %vec.i32.load) - ret void -} - - -define void @test_vec_struct_alloca() { -; CHECK-LABEL: define {{[^@]+}}@test_vec_struct_alloca() { -; CHECK-NEXT: [[VEC_ALLOCA:%.*]] = alloca [[SIMPLE_STRUCT_0:%.*]], align 8 -; CHECK-NEXT: [[I1_1:%.*]] = call i1 @def1() -; CHECK-NEXT: [[I1_2:%.*]] = call i1 @def1() -; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[I1_1]] to i32 -; CHECK-NEXT: [[VEC1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[I1_2]] to i32 -; CHECK-NEXT: [[VEC_12:%.*]] = insertelement <2 x i32> [[VEC1]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[STRUCT:%.*]] = insertvalue [[SIMPLE_STRUCT_0]] poison, <2 x i32> [[VEC_12]], 0 -; CHECK-NEXT: store [[SIMPLE_STRUCT_0]] [[STRUCT]], ptr [[VEC_ALLOCA]], align 8 -; CHECK-NEXT: [[VEC_I32_LOAD:%.*]] = load i32, ptr [[VEC_ALLOCA]], align 4 -; CHECK-NEXT: call void @use32(i32 [[VEC_I32_LOAD]]) -; CHECK-NEXT: ret void -; - %vec.alloca = alloca %simple.struct, align 1 - %i1.1 = call i1 @def1() - %i1.2 = call i1 @def1() - %vec = insertelement <2 x i1> undef, i1 %i1.1, i32 0 - %vec.1 = insertelement <2 x i1> %vec, i1 %i1.2, i32 1 - %struct = insertvalue %simple.struct poison, <2 x i1> %vec.1, 0 - store %simple.struct %struct, ptr %vec.alloca - %vec.i32.load = load i32, ptr %vec.alloca, align 4 - call void @use32(i32 %vec.i32.load) - ret void -} 
- -; Only check mutating function arguments. Mutating return types is not yet supported and isn't required for now. -define i1 @test_argument(<7 x i1> %arg) { -; CHECK-LABEL: define {{[^@]+}}@test_argument -; CHECK-SAME: (<7 x i32> [[ARG:%.*]]) { -; CHECK-NEXT: [[VAL1:%.*]] = extractelement <7 x i32> [[ARG]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[VAL1]] to i1 -; CHECK-NEXT: ret i1 [[TMP1]] -; - %val = extractelement <7 x i1> %arg, i32 3 - ret i1 %val -} - -define i1 @test_struct_gep(ptr %arg, i32 %index) { -; CHECK-LABEL: define {{[^@]+}}@test_struct_gep -; CHECK-SAME: (ptr [[ARG:%.*]], i32 [[INDEX:%.*]]) { -; CHECK-NEXT: [[PTR1:%.*]] = getelementptr [[SIMPLE_STRUCT_0:%.*]], ptr [[ARG]], i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[VEC:%.*]] = load <2 x i32>, ptr [[PTR1]], align 8 -; CHECK-NEXT: [[RES2:%.*]] = extractelement <2 x i32> [[VEC]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[RES2]] to i1 -; CHECK-NEXT: ret i1 [[TMP1]] -; - %ptr = getelementptr %simple.struct, ptr %arg, i32 %index, i32 0 - %vec = load <2 x i1>, ptr %ptr - %res = extractelement <2 x i1> %vec, i32 1 - ret i1 %res -} - -define i1 @test_shufflevector(<2 x i1> %args.0, <2 x i1> %args.1) { -; CHECK-LABEL: define {{[^@]+}}@test_shufflevector -; CHECK-SAME: (<2 x i32> [[ARGS_0:%.*]], <2 x i32> [[ARGS_1:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[ARGS_0]], <2 x i32> [[ARGS_1]], <1 x i32> -; CHECK-NEXT: [[RES2:%.*]] = extractelement <1 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[RES2]] to i1 -; CHECK-NEXT: ret i1 [[TMP1]] -; - %tmp = shufflevector <2 x i1> %args.0, <2 x i1> %args.1, <1 x i32> - %res = extractelement <1 x i1> %tmp, i32 0 - ret i1 %res -} - -define void @test_pointee_metadata(<7 x i1>, ptr) !types !1 { -; CHECK-LABEL: define {{[^@]+}}@test_pointee_metadata -; CHECK-SAME: (<7 x i32> [[TMP0:%.*]], ptr [[TMP1:%.*]]) !types [[META2:![0-9]+]] { -; CHECK-NEXT: ret void -; - ret void -} - -!named = !{!10, !11} - -!1 = !{!"function", !"void", 
<7 x i1> poison, !2} -!2 = !{i32 0, %simple.struct poison} -!10 = !{<1 x i1> undef} -!11 = !{<3 x i1> poison} -;. -; CHECK: [[META0:![0-9]+]] = !{<1 x i32> undef} -; CHECK: [[META1:![0-9]+]] = !{<3 x i32> poison} -; CHECK: [[META2]] = !{!"function", !"void", <7 x i32> poison, [[META3:![0-9]+]]} -; CHECK: [[META3]] = !{i32 0, %simple.struct.0 poison} -;. diff --git a/compilerutils/test/value-origin-tracking/basic-tests.ll b/compilerutils/test/value-origin-tracking/basic-tests.ll index c77587df09..66e112e6b0 100644 --- a/compilerutils/test/value-origin-tracking/basic-tests.ll +++ b/compilerutils/test/value-origin-tracking/basic-tests.ll @@ -53,18 +53,18 @@ define void @testConstantVector() { ; CHECK: (<9 x i8> zeroinitializer): Constant: 0x0; Constant: 0x0; Constant: 0x0 call void @analyze(<9 x i8> zeroinitializer) -; CHECK: (<1 x i32> ): Constant: 0xdeadbeef +; CHECK: (<1 x i32> {{(splat \(i32 \-559038737\))|(>)}}): Constant: 0xdeadbeef call void @analyze(<1 x i32> ) ; CHECK: (<4 x i8> ): Constant: 0x4030201 call void @analyze(<4 x i8> ) -; CHECK: (<1 x float> ): Constant: 0x3e000000 +; CHECK: (<1 x float> {{(splat \(float 1\.250000e\-01\))|()}}): Constant: 0x3e000000 call void @analyze(<1 x float> ) ; computeKnownBits only supports integer vectors, and our ; handling doesn't support smaller-than-slice element types. -; CHECK: (<1 x half> ): Dynamic +; CHECK: (<1 x half> {{(splat \(half 0xH1234\))|()}}): Dynamic call void @analyze(<1 x half> ) ; CHECK: (<4 x float> ): Constant: 0x0; Constant: 0x43800000; Constant: 0x0; UndefOrPoison diff --git a/docs/DdnRelocatableShaderElf.md b/docs/DdnRelocatableShaderElf.md index aa2c28cd98..ee5ec5cefe 100644 --- a/docs/DdnRelocatableShaderElf.md +++ b/docs/DdnRelocatableShaderElf.md @@ -121,8 +121,8 @@ state are used to generate all of the color export instructions and they are appended to the end of the unlinked shader. 
See the `ColorExportShader` class in [GlueShader.cpp](../lgc/elfLinker/GlueShader.cpp) for the generation of the epilogue. To see how the unlinked shader handles color export see the -`LowerFragColorExport` pass in -[FragColorExport.cpp](../lgc/patch/FragColorExport.cpp). +`LowerFragmentColorExport` pass in +[FragmentColorExport.cpp](../lgc/patch/FragmentColorExport.cpp). These epilogues and prologues mean that unlinked shaders do not have to depend on the format of the vertex input attributes and color exports. diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h index 01497ccec7..44fbb1280b 100644 --- a/include/vkgcDefs.h +++ b/include/vkgcDefs.h @@ -76,7 +76,7 @@ namespace Vkgc { static const unsigned Version = LLPC_INTERFACE_MAJOR_VERSION; -static const unsigned InternalDescriptorSetId = static_cast(-1); +static const unsigned InternalDescriptorSetId = 0xFFFFFFF0; static const unsigned MaxVertexAttribs = 64; static const unsigned MaxVertexBindings = 64; static const unsigned MaxColorTargets = 8; @@ -541,6 +541,7 @@ struct PipelineOptions { bool optimizePointSizeWrite; ///< If set, the write of PointSize in the last vertex processing stage will be ///< eliminated if the write value is 1.0. CompileConstInfo *compileConstInfo; ///< Compile time constant data. + unsigned reserved22; }; /// Prototype of allocator for output data buffer, used in shader-specific operations. @@ -939,6 +940,15 @@ struct PipelineShaderOptions { /// Indicate whether the vertex shader is used by transform pipeline bool enableTransformShader; + + /// Application workaround: force underflow prevention for log and pow calls + /// usually required for shaders that are intolerant of this when valid math + /// optimisations are applied. + bool forceUnderflowPrevention; + + /// Force scope for memory barrier (0 - do not force, nonzero - value of Scope enumeration from SPIR-V headers with + /// the exception of CrossDevice that cannot be set at all). 
+ unsigned forceMemoryBarrierScope; }; /// Represents YCbCr sampler meta data in resource descriptor @@ -1050,7 +1060,12 @@ constexpr unsigned UberFetchShaderAttribMaskComponent2 = 0x0040000u; constexpr unsigned UberFetchShaderAttribMaskComponent3 = 0x0080000u; constexpr unsigned UberFetchShaderAttribMaskIsBgra = 0x0100000u; -/// Represents the bit field info of struct BilUberFetchShaderAttribInfo +// OpenGL internal vertex input rate +enum VKInternalVertexInputRate { + VK_VERTEX_INPUT_RATE_PER_DRAW_PER_VERTEX = 0x10, // Vertex input rate per draw and per vertex + VK_VERTEX_INPUT_RATE_PER_DRAW_PER_INSTANCE = 0x11, // Vertex input rate per draw and per instance + VK_VERTEX_INPUT_RATE_PER_DRAW = 0x12, // Vertex input rate per draw +}; // OpenGL extended vertex attribute format typedef enum VKInternalExtFormat { @@ -1430,6 +1445,8 @@ struct ComputePipelineBuildInfo { const void *pClientMetadata; ///< Pointer to (optional) client-defined data to be stored inside the ELF size_t clientMetadataSize; ///< Size (in bytes) of the client-defined data UniformConstantMap *pUniformMap; ///< Pointer to the uniform constants map + GraphicsPipelineBuildInfo *transformGraphicsPipeline; ///< For OpenGL: Graphics pipeline build info holding a vertex + ///< shader that can be invoked by the compute shader }; /// Represents output of building a ray tracing pipeline. 
@@ -1466,15 +1483,23 @@ struct RayTracingPipelineBuildInfo { unsigned pipelineLibStageMask; ///< Pipeline library stage mask //@} - unsigned payloadSizeMaxInLib; ///< Pipeline library maxPayloadSize - unsigned attributeSizeMaxInLib; ///< Pipeline library maxAttributeSize - bool isReplay; ///< Pipeline is created for replaying - const void *pClientMetadata; ///< Pointer to (optional) client-defined data to be - /// stored inside the ELF - size_t clientMetadataSize; ///< Size (in bytes) of the client-defined data - unsigned cpsFlags; ///< Cps feature flags - GpurtOption *pGpurtOptions; ///< Array of GPURT options - unsigned gpurtOptionCount; ///< Number of GPURT options + unsigned payloadSizeMaxInLib; ///< Pipeline library maxPayloadSize + unsigned attributeSizeMaxInLib; ///< Pipeline library maxAttributeSize + bool isReplay; ///< Pipeline is created for replaying + const void *pClientMetadata; ///< Pointer to (optional) client-defined data to be + /// stored inside the ELF + size_t clientMetadataSize; ///< Size (in bytes) of the client-defined data + unsigned cpsFlags; ///< Cps feature flags + GpurtOption *pGpurtOptions; ///< Array of GPURT options + unsigned gpurtOptionCount; ///< Number of GPURT options + bool rtIgnoreDeclaredPayloadSize; ///< Ignore the declared payload size in the shader to address issues with Proton. + /// Proton games pass a dynamic maxPipelineRayPayloadSize into the API. + /// This dynamic size is used by the pipeline instead of the declared payload size + /// in the shader, it may be smaller than the declared size. + /// This behavior of Proton is incorrect according to the latest clarification in + /// the vulkan spec https://gitlab.khronos.org/vulkan/vulkan/-/issues/4080 + /// However, we need to support this incorrect behavior for Proton before Proton + /// fixes it. 
}; /// Ray tracing max shader name length diff --git a/lgc/CMakeLists.txt b/lgc/CMakeLists.txt index f7a81abd85..f0a3a07420 100644 --- a/lgc/CMakeLists.txt +++ b/lgc/CMakeLists.txt @@ -177,6 +177,7 @@ target_sources(LLVMlgc PRIVATE patch/LowerInOut.cpp patch/LowerInvariantLoads.cpp patch/IncludeLlvmIr.cpp + patch/InitializeWorkgroupMemory.cpp patch/ScalarizeLoads.cpp patch/LowerMulDx9Zero.cpp patch/AddLoopMetadata.cpp @@ -187,7 +188,6 @@ target_sources(LLVMlgc PRIVATE patch/CollectResourceUsage.cpp patch/SetupTargetFeatures.cpp patch/PassthroughHullShader.cpp - patch/PatchInitializeWorkgroupMemory.cpp patch/ApplyWorkarounds.cpp patch/ShaderInputs.cpp patch/ShaderMerger.cpp @@ -216,6 +216,7 @@ target_sources(LLVMlgc PRIVATE include/lgc/patch/FragmentColorExport.h include/lgc/patch/GenerateCopyShader.h include/lgc/patch/IncludeLlvmIr.h + include/lgc/patch/InitializeWorkgroupMemory.h include/lgc/patch/LgcLowering.h include/lgc/patch/LowerBufferOperations.h include/lgc/patch/LowerCooperativeMatrix.h @@ -230,7 +231,6 @@ target_sources(LLVMlgc PRIVATE include/lgc/patch/LowerSubgroupOps.h include/lgc/patch/MutateEntryPoint.h include/lgc/patch/PassthroughHullShader.h - include/lgc/patch/PatchInitializeWorkgroupMemory.h include/lgc/patch/PeepholeOptimization.h include/lgc/patch/PreparePipelineAbi.h include/lgc/patch/ScalarizeLoads.h @@ -293,7 +293,6 @@ target_sources(LLVMlgc PRIVATE target_sources(LLVMlgc PRIVATE include/lgc/util/AddressExtender.h include/lgc/util/BuilderBase.h - include/lgc/util/Debug.h include/lgc/util/GfxRegHandler.h include/lgc/util/GfxRegHandlerBase.h include/lgc/util/Internal.h @@ -308,6 +307,7 @@ target_sources(LLVMlgc PRIVATE interface/lgc/BuiltInDefs.h interface/lgc/BuiltIns.h interface/lgc/CommonDefs.h + interface/lgc/Debug.h interface/lgc/Disassembler.h interface/lgc/ElfLinker.h interface/lgc/EnumIterator.h diff --git a/lgc/builder/ArithBuilder.cpp b/lgc/builder/ArithBuilder.cpp index 7a12be6e97..33a745520a 100644 --- 
a/lgc/builder/ArithBuilder.cpp +++ b/lgc/builder/ArithBuilder.cpp @@ -31,6 +31,7 @@ #include "lgc/builder/BuilderImpl.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include @@ -196,6 +197,87 @@ Value *BuilderImpl::CreateFpTruncWithRounding(Value *value, Type *destTy, Roundi instName); } +// ===================================================================================================================== +// The conversion between float8 and float32. +// +// @param value : Input value +// @param destTy : Type to convert to +// @param isBfloat : Whether float8 type is bfloat8 +// @param instName : Name to give instruction(s) +llvm::Value *BuilderImpl::CreateFp8Convert(llvm::Value *value, llvm::Type *destTy, bool isBfloat, + const llvm::Twine &instName) { + bool isToFloat32 = false; + auto srcType = value->getType(); + if (srcType->isIntOrIntVectorTy(8)) { + // must be float8 -> float32 + assert(destTy->getScalarType()->isFloatTy()); + isToFloat32 = true; + } else { + // must be float32 -> float8 + assert(destTy->isIntOrIntVectorTy(8) && srcType->getScalarType()->isFloatTy()); + } + + unsigned eleCount = 1; + const auto vectorTy = dyn_cast(srcType); + if (vectorTy) + eleCount = vectorTy->getNumElements(); + + Intrinsic::AMDGCNIntrinsics inst; + if (isToFloat32) { + if (eleCount == 1) { + value = CreateZExt(value, getInt32Ty()); + inst = isBfloat ? Intrinsic::amdgcn_cvt_f32_bf8 : Intrinsic::amdgcn_cvt_f32_fp8; + return CreateIntrinsic(getFloatTy(), inst, {value, getInt32(0)}); + } + + // i8vec4 -> int32 + value = CreateShuffleVector(value, {0, 1, 2, 3}); + value = CreateBitCast(value, getInt32Ty()); + + Value *ret0 = PoisonValue::get(FixedVectorType::get(getFloatTy(), 2)); + if (eleCount >= 2) { + inst = isBfloat ? 
Intrinsic::amdgcn_cvt_pk_f32_bf8 : Intrinsic::amdgcn_cvt_pk_f32_fp8; + ret0 = CreateIntrinsic(FixedVectorType::get(getFloatTy(), 2), inst, {value, getFalse()}); + } + + Value *ret1 = PoisonValue::get(FixedVectorType::get(getFloatTy(), 2)); + if (eleCount >= 3) { + inst = isBfloat ? Intrinsic::amdgcn_cvt_pk_f32_bf8 : Intrinsic::amdgcn_cvt_pk_f32_fp8; + ret1 = CreateIntrinsic(FixedVectorType::get(getFloatTy(), 2), inst, {value, getTrue()}); + } + + SmallVector shuffleMask = createSequentialMask(0, eleCount, 0); + + return CreateShuffleVector(ret0, ret1, shuffleMask); + } + + // float32 -> float8 + inst = isBfloat ? Intrinsic::amdgcn_cvt_pk_bf8_f32 : Intrinsic::amdgcn_cvt_pk_fp8_f32; + if (eleCount == 1) { + value = CreateIntrinsic(getInt32Ty(), inst, {value, value, getFalse()}); + return CreateTrunc(value, destTy); + } + + Value *ret = getInt32(0); + if (eleCount >= 2) { + Value *elem0 = CreateExtractElement(value, uint64_t(0)); + Value *elem1 = CreateExtractElement(value, 1); + ret = CreateIntrinsic(getInt32Ty(), inst, {elem0, elem1, ret, getFalse()}); + } + + if (eleCount >= 3) { + Value *elem0 = CreateExtractElement(value, 2); + Value *elem1 = (eleCount > 3) ? CreateExtractElement(value, 3) : ConstantFP::get(getFloatTy(), 0.0); + ret = CreateIntrinsic(getInt32Ty(), inst, {elem0, elem1, ret, getTrue()}); + } + + ret = CreateBitCast(ret, FixedVectorType::get(getInt8Ty(), 4)); + + SmallVector shuffleMask = createSequentialMask(0, eleCount, 0); + + return CreateShuffleVector(ret, shuffleMask); +} + // ===================================================================================================================== // Create quantize operation: truncates float (or vector) value to a value that is representable by a half. 
// diff --git a/lgc/builder/BuilderRecorder.cpp b/lgc/builder/BuilderRecorder.cpp index 78bb7138b3..250b74e0e4 100644 --- a/lgc/builder/BuilderRecorder.cpp +++ b/lgc/builder/BuilderRecorder.cpp @@ -76,6 +76,8 @@ StringRef BuilderRecorder::getCallName(BuilderOpcode opcode) { return "cube.face.index"; case BuilderOpcode::FpTruncWithRounding: return "fp.trunc.with.rounding"; + case BuilderOpcode::Fp8Convert: + return "fp8.convert"; case BuilderOpcode::QuantizeToFp16: return "quantize.to.fp16"; case BuilderOpcode::SMod: @@ -817,6 +819,18 @@ Value *Builder::CreateFpTruncWithRounding(Value *value, Type *destTy, RoundingMo instName); } +// ===================================================================================================================== +// The conversion between float8 and float32. +// +// @param value : Input value +// @param destTy : Type to convert to +// @param isBfloat : Whether float8 type is bfloat8 +// @param instName : Name to give instruction(s) +llvm::Value *Builder::CreateFp8Convert(llvm::Value *value, llvm::Type *destTy, bool isBfloat /* = false*/, + const llvm::Twine &instName /* = ""*/) { + return record(BuilderOpcode::Fp8Convert, destTy, {value, getInt1(isBfloat)}, instName); +} + // ===================================================================================================================== // Create quantize operation. 
// @@ -2022,6 +2036,7 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRefCreateFpTruncWithRounding(args[0], call->getType(), roundingMode); } + case BuilderOpcode::Fp8Convert: { + return m_builder->CreateFp8Convert(args[0], call->getType(), cast(args[1])->getZExtValue()); + } + case BuilderOpcode::QuantizeToFp16: { return m_builder->CreateQuantizeToFp16(args[0]); } diff --git a/lgc/builder/DescBuilder.cpp b/lgc/builder/DescBuilder.cpp index 0b1374d0af..e74fbf234b 100644 --- a/lgc/builder/DescBuilder.cpp +++ b/lgc/builder/DescBuilder.cpp @@ -119,14 +119,18 @@ Value *BuilderImpl::createBufferDesc(uint64_t descSet, unsigned binding, Value * if (return64Address) return desc; assert(convertFatPointer); + bool globallyCoherent = flags & BufferFlagCoherent; if (isCompact) { desc = CreateBitCast(desc, getInt64Ty()); if (stride == 0) - desc = create(desc); + desc = create(desc, globallyCoherent); else - desc = create(desc, getInt32(stride)); + desc = create(desc, getInt32(stride), globallyCoherent); } else { - desc = create(desc); + if (stride == 0) + desc = create(desc, globallyCoherent); + else + desc = create(desc, globallyCoherent); } } else if (node->concreteType == ResourceNodeType::InlineBuffer) { // Handle an inline buffer specially. Get a pointer to it, then expand to a descriptor. 
@@ -136,9 +140,9 @@ Value *BuilderImpl::createBufferDesc(uint64_t descSet, unsigned binding, Value * assert(convertFatPointer); desc = CreatePtrToInt(descPtr, getInt64Ty()); if (stride == 0) - desc = create(desc); + desc = create(desc, false); else - desc = create(desc, getInt32(stride)); + desc = create(desc, getInt32(stride), false); } else { ResourceNodeType resType = node->concreteType; ResourceNodeType abstractType = node->abstractType; @@ -167,10 +171,12 @@ Value *BuilderImpl::createBufferDesc(uint64_t descSet, unsigned binding, Value * descPtr = CreateBitCast(descPtr, getDescPtrTy()); if (convertFatPointer) { bool forceRawView = flags & BufferFlagForceRawView; + bool globallyCoherent = flags & BufferFlagCoherent; if (stride == 0) - desc = create(descPtr, forceRawView, isCompact); + desc = create(descPtr, forceRawView, isCompact, globallyCoherent); else - desc = create(descPtr, forceRawView, isCompact, getInt32(stride)); + desc = + create(descPtr, forceRawView, isCompact, globallyCoherent, getInt32(stride)); } else { // Load the descriptor. 
desc = CreateLoad(getDescTy(resType), descPtr); @@ -379,10 +385,7 @@ Value *BuilderImpl::scalarizeIfUniform(Value *value, bool isNonUniform) { // // @param desc : The buffer descriptor base to build for the buffer compact descriptor // @param stride : stride for the buffer descriptor to access in index mode -Value *BuilderImpl::buildBufferCompactDesc(Value *desc, unsigned stride) { - // Bitcast the pointer to v2i32 - desc = CreatePtrToInt(desc, getInt64Ty()); - desc = CreateBitCast(desc, FixedVectorType::get(getInt32Ty(), 2)); +Value *BuilderImpl::buildBufferCompactDesc(Value *desc, Value *stride) { const GfxIpVersion gfxIp = m_pipelineState->getTargetInfo().getGfxIpVersion(); // Extract compact buffer descriptor @@ -396,14 +399,8 @@ Value *BuilderImpl::buildBufferCompactDesc(Value *desc, unsigned stride) { bufDesc = CreateInsertElement(bufDesc, addrLo, uint64_t(0)); // Dword 1 - SqBufRsrcWord1 sqBufRsrcWord1 = {}; - sqBufRsrcWord1.bits.baseAddressHi = UINT16_MAX; - addrHi = CreateAnd(addrHi, getInt32(sqBufRsrcWord1.u32All)); - if (stride) { - SqBufRsrcWord1 sqBufRsrcWord1Stride = {}; - sqBufRsrcWord1Stride.bits.stride = stride; - addrHi = CreateOr(addrHi, getInt32(sqBufRsrcWord1Stride.u32All)); - } + if (stride) + addrHi = CreateOr(addrHi, CreateShl(stride, SqBufRsrcTWord1StrideShift)); bufDesc = CreateInsertElement(bufDesc, addrHi, 1); // Dword 2 @@ -422,7 +419,7 @@ Value *BuilderImpl::buildBufferCompactDesc(Value *desc, unsigned stride) { sqBufRsrcWord3.gfx10.resourceLevel = 1; sqBufRsrcWord3.gfx10.oobSelect = stride ? 3 : 2; assert(sqBufRsrcWord3.u32All == 0x21014FAC || sqBufRsrcWord3.u32All == 0x31014FAC); - } else if (gfxIp.major >= 11) { + } else if (gfxIp.major == 11) { sqBufRsrcWord3.gfx11.format = BUF_FORMAT_32_UINT; sqBufRsrcWord3.gfx11.oobSelect = stride ? 
3 : 2; assert(sqBufRsrcWord3.u32All == 0x20014FAC || sqBufRsrcWord3.u32All == 0x30014FAC); diff --git a/lgc/builder/ImageBuilder.cpp b/lgc/builder/ImageBuilder.cpp index 4c01853832..86d7bed32d 100644 --- a/lgc/builder/ImageBuilder.cpp +++ b/lgc/builder/ImageBuilder.cpp @@ -1643,7 +1643,6 @@ Value *BuilderImpl::CreateImageQuerySize(unsigned dim, unsigned flags, Value *im // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateImageGetLod(unsigned dim, unsigned flags, Value *imageDesc, Value *samplerDesc, Value *coord, const Twine &instName) { - assert(imageDesc->getType()->isPointerTy() && samplerDesc->getType()->isPointerTy()); if (isa(imageDesc) || isa(samplerDesc)) return PoisonValue::get(FixedVectorType::get(getFloatTy(), 2)); diff --git a/lgc/builder/InOutBuilder.cpp b/lgc/builder/InOutBuilder.cpp index 79aecbc882..9f39a854ea 100644 --- a/lgc/builder/InOutBuilder.cpp +++ b/lgc/builder/InOutBuilder.cpp @@ -708,7 +708,10 @@ std::tuple BuilderImpl::getInterpModeAndValue(InOutInfo if (inputInfo.getInterpMode() == InOutInfo::InterpModeSmooth) { if (auxInterpValue) { assert(interpLoc == InOutInfo::InterpLocCenter); - return {InOutInfo::InterpModeSmooth, evalIjOffsetSmooth(auxInterpValue)}; + BuiltInKind builtIn = lgc::BuiltInInterpPullMode; + markBuiltInInputUsage(builtIn, 0, {}); + return {InOutInfo::InterpModeSmooth, + create(FixedVectorType::get(getFloatTy(), 2), auxInterpValue)}; } BuiltInKind builtInId; @@ -758,51 +761,12 @@ std::tuple BuilderImpl::getInterpModeAndValue(InOutInfo resUsage->builtInUsage.fs.noperspective = true; Value *interpValue = readBuiltIn(false, builtInId, {}, nullptr, nullptr, ""); - if (auxInterpValue) - interpValue = adjustIj(interpValue, auxInterpValue); + if (auxInterpValue) { + interpValue = create(interpValue->getType(), interpValue, auxInterpValue); + } return {InOutInfo::InterpModeSmooth, interpValue}; } -// 
===================================================================================================================== -// Evaluate I,J for interpolation: center offset, smooth (perspective) version -// -// @param offset : Offset value, <2 x float> or <2 x half> -Value *BuilderImpl::evalIjOffsetSmooth(Value *offset) { - // Get - Value *pullModel = readBuiltIn(false, BuiltInInterpPullMode, {}, nullptr, nullptr, ""); - // Adjust each coefficient by offset. - Value *adjusted = adjustIj(pullModel, offset); - // Extract part of that - Value *ijDivW = CreateShuffleVector(adjusted, adjusted, ArrayRef{0, 1}); - Value *rcpW = CreateExtractElement(adjusted, 2); - // Get W by making a reciprocal of 1/W - Value *w = CreateFDiv(ConstantFP::get(getFloatTy(), 1.0), rcpW); - w = CreateVectorSplat(2, w); - return CreateFMul(ijDivW, w); -} - -// ===================================================================================================================== -// Adjust I,J values by offset. -// This adjusts value by its X and Y derivatives times the X and Y components of offset. -// If value is a vector, this is done component-wise. 
-// -// @param value : Value to adjust, float or vector of float -// @param offset : Offset to adjust by, <2 x float> or <2 x half> -Value *BuilderImpl::adjustIj(Value *value, Value *offset) { - offset = CreateFPExt(offset, FixedVectorType::get(getFloatTy(), 2)); - Value *offsetX = CreateExtractElement(offset, uint64_t(0)); - Value *offsetY = CreateExtractElement(offset, 1); - if (auto vecTy = dyn_cast(value->getType())) { - offsetX = CreateVectorSplat(vecTy->getNumElements(), offsetX); - offsetY = CreateVectorSplat(vecTy->getNumElements(), offsetY); - } - Value *derivX = CreateDerivative(value, /*isY=*/false, /*isFine=*/true); - Value *derivY = CreateDerivative(value, /*isY=*/true, /*isFine=*/true); - Value *adjustX = CreateFAdd(value, CreateFMul(derivX, offsetX)); - Value *adjustY = CreateFAdd(adjustX, CreateFMul(derivY, offsetY)); - return adjustY; -} - // ===================================================================================================================== // Create a write to an XFB (transform feedback / streamout) buffer. // The value to write must be a scalar or vector type with no more than four elements. 
@@ -904,15 +868,7 @@ Instruction *BuilderImpl::CreateWriteXfbOutput(Value *valueToWrite, bool isBuilt } } - // XFB: @lgc.output.export.xfb.%Type%(i32 xfbBuffer, i32 xfbOffset, i32 streamId, %Type% outputValue) - SmallVector args; - std::string instName = lgcName::OutputExportXfb; - args.push_back(getInt32(xfbBuffer)); - args.push_back(xfbOffset); - args.push_back(getInt32(streamId)); - args.push_back(valueToWrite); - addTypeMangling(nullptr, args, instName); - return CreateNamedCall(instName, getVoidTy(), args, {}); + return create(xfbBuffer, cast(xfbOffset)->getZExtValue(), streamId, valueToWrite); } // ===================================================================================================================== @@ -2120,7 +2076,6 @@ namespace StageValidMask { constexpr const ShaderStageMask C(ShaderStage::Compute); constexpr const ShaderStageMask D(ShaderStage::TessEval); constexpr const ShaderStageMask H(ShaderStage::TessControl); -constexpr const ShaderStageMask G(ShaderStage::Geometry); constexpr const ShaderStageMask HD({ShaderStage::TessControl, ShaderStage::TessEval}); constexpr const ShaderStageMask HDG({ShaderStage::TessControl, ShaderStage::TessEval, ShaderStage::Geometry}); constexpr const ShaderStageMask HDGP({ShaderStage::TessControl, ShaderStage::TessEval, ShaderStage::Geometry, diff --git a/lgc/builder/MatrixBuilder.cpp b/lgc/builder/MatrixBuilder.cpp index 555bbc0063..4d63de7ff3 100644 --- a/lgc/builder/MatrixBuilder.cpp +++ b/lgc/builder/MatrixBuilder.cpp @@ -378,7 +378,9 @@ Type *BuilderCommon::transCooperativeMatrixElementType(CooperativeMatrixElementT // // @param elemType : the matrix element type // @param layout : the matrix layout -Type *BuilderCommon::getCooperativeMatrixTy(CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout) { +// @param kSize : the matrix K size +Type *BuilderCommon::getCooperativeMatrixTy(CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout, + unsigned kSize) { // Note: the 
layout currently has no influence on the type. In the long run, we should switch to genuinely opaque // types at the LGC level, and parameterize the type using both the element type and the layout. diff --git a/lgc/builder/MiscBuilder.cpp b/lgc/builder/MiscBuilder.cpp index 3817508142..9b3598110c 100644 --- a/lgc/builder/MiscBuilder.cpp +++ b/lgc/builder/MiscBuilder.cpp @@ -28,6 +28,7 @@ * @brief LLPC source file: implementation of miscellaneous Builder methods *********************************************************************************************************************** */ +#include "lgc/LgcDialect.h" #include "lgc/builder/BuilderImpl.h" #include "lgc/state/TargetInfo.h" #include "llvm/IR/InlineAsm.h" @@ -52,15 +53,7 @@ Instruction *BuilderImpl::CreateEmitVertex(unsigned streamId) { m_pipelineState->getRasterizerState().rasterStream == streamId) m_pipelineState->setVertexStreamActive(streamId); - // Get GsWaveId - std::string callName = lgcName::InputImportBuiltIn; - callName += "GsWaveId.i32.i32"; - Value *gsWaveId = CreateNamedCall(callName, getInt32Ty(), getInt32(BuiltInGsWaveId), {}); - - // Do the sendmsg. - // [9:8] = stream, [5:4] = 2 (emit), [3:0] = 2 (GS) - unsigned msg = (streamId << GsEmitCutStreamIdShift) | GsEmit; - return CreateIntrinsic(Intrinsic::amdgcn_s_sendmsg, {}, {getInt32(msg), gsWaveId}, nullptr); + return create(streamId); } // ===================================================================================================================== @@ -76,15 +69,7 @@ Instruction *BuilderImpl::CreateEndPrimitive(unsigned streamId) { m_pipelineState->getRasterizerState().rasterStream == streamId) m_pipelineState->setVertexStreamActive(streamId); - // Get GsWaveId - std::string callName = lgcName::InputImportBuiltIn; - callName += "GsWaveId.i32.i32"; - Value *gsWaveId = CreateNamedCall(callName, getInt32Ty(), getInt32(BuiltInGsWaveId), {}); - - // Do the sendmsg. 
- // [9:8] = stream, [5:4] = 1 (cut), [3:0] = 2 (GS) - unsigned msg = (streamId << GsEmitCutStreamIdShift) | GsCut; - return CreateIntrinsic(Intrinsic::amdgcn_s_sendmsg, {}, {getInt32(msg), gsWaveId}, nullptr); + return create(streamId); } // ===================================================================================================================== diff --git a/lgc/builder/SubgroupBuilder.cpp b/lgc/builder/SubgroupBuilder.cpp index e5abd20412..05d34b4b48 100644 --- a/lgc/builder/SubgroupBuilder.cpp +++ b/lgc/builder/SubgroupBuilder.cpp @@ -374,6 +374,13 @@ Value *BuilderImpl::CreateSubgroupBallotFindMsb(Value *const value, const Twine // @param instName : Name to give final instruction. Value *BuilderImpl::createSubgroupShuffle(const SubgroupHelperLaneState &state, Value *const value, Value *const index, ShaderStageEnum shaderStage, const Twine &instName) { + // TODO: Opportunity for uniformity analysis: We can also use readlane when the index is uniform. + if (isa(index)) { + auto mapFunc = [](BuilderBase &builder, ArrayRef mapped, ArrayRef passthrough) -> Value * { + return builder.CreateIntrinsic(builder.getInt32Ty(), Intrinsic::amdgcn_readlane, {mapped[0], passthrough[0]}); + }; + return CreateMapToSimpleType(mapFunc, value, index); + } if (supportWaveWideBPermute(shaderStage)) { auto mapFunc = [](BuilderBase &builder, ArrayRef mappedArgs, @@ -425,14 +432,7 @@ Value *BuilderImpl::createSubgroupShuffle(const SubgroupHelperLaneState &state, return result; } - auto mapFunc = [this](BuilderBase &builder, ArrayRef mappedArgs, - ArrayRef passthroughArgs) -> Value * { - Value *const readlane = - builder.CreateIntrinsic(builder.getInt32Ty(), Intrinsic::amdgcn_readlane, {mappedArgs[0], passthroughArgs[0]}); - return createWaterfallLoop(cast(readlane), 1); - }; - - return CreateMapToSimpleType(mapFunc, value, index); + return createShuffleLoop(state, value, index); } // 
===================================================================================================================== @@ -1463,11 +1463,8 @@ Value *BuilderImpl::createGroupBallot(const SubgroupHelperLaneState &state, Valu llvm::Value *BuilderImpl::createShuffleLoop(const SubgroupHelperLaneState &state, llvm::Value *const value, llvm::Value *const index, const llvm::Twine &instName) { assert(value != nullptr && index != nullptr); - // Return readlane directly, if the index is a constant value. - if (isa(index)) - return CreateIntrinsic(getInt32Ty(), Intrinsic::amdgcn_readlane, {value, index}); - // Creat workList out of loop + // Create workList out of loop // By implementation, the Insert point has been set to the callInst when call processCall auto *loopPoint = &*(GetInsertPoint()); auto *originalBlock = loopPoint->getParent(); diff --git a/lgc/builder/YCbCrAddressHandler.cpp b/lgc/builder/YCbCrAddressHandler.cpp index 3674048e1f..b8912956a2 100644 --- a/lgc/builder/YCbCrAddressHandler.cpp +++ b/lgc/builder/YCbCrAddressHandler.cpp @@ -136,8 +136,8 @@ void YCbCrAddressHandler::genHeightAndPitch(unsigned bits, unsigned bpp, unsigne m_swizzleMode = m_regHandler->getReg(SqRsrcRegs::SwizzleMode); switch (m_gfxIp->major) { - case 10: - case 11: { + case 11: + case 10: { const unsigned elementBytes = bpp >> 3; const unsigned pitchAlign = (256 / elementBytes); diff --git a/lgc/elfLinker/ColorExportShader.cpp b/lgc/elfLinker/ColorExportShader.cpp index d30622602e..216a7fdab7 100644 --- a/lgc/elfLinker/ColorExportShader.cpp +++ b/lgc/elfLinker/ColorExportShader.cpp @@ -53,7 +53,7 @@ ColorExportShader::ColorExportShader(PipelineState *pipelineState, ArrayRef(colorExportFunc->back().getTerminator()); BuilderBase builder(ret); diff --git a/lgc/elfLinker/ColorExportShader.h b/lgc/elfLinker/ColorExportShader.h index 8e013def42..b93021f4fd 100644 --- a/lgc/elfLinker/ColorExportShader.h +++ b/lgc/elfLinker/ColorExportShader.h @@ -83,7 +83,7 @@ class ColorExportShader : public 
GlueShader { // The encoded or hashed (in some way) single string version of the above. std::string m_shaderString; bool m_killEnabled; // True if this fragment shader has kill enabled. - FragColorExport::Key m_key; + FragmentColorExport::Key m_key; }; } // namespace lgc diff --git a/lgc/elfLinker/NullFragmentShader.cpp b/lgc/elfLinker/NullFragmentShader.cpp index 4c1af783c6..b675dc9153 100644 --- a/lgc/elfLinker/NullFragmentShader.cpp +++ b/lgc/elfLinker/NullFragmentShader.cpp @@ -44,7 +44,7 @@ using namespace llvm; // @returns : The module containing the null fragment shader. Module *NullFragmentShader::generate() { Module *module = generateEmptyModule(); - FragColorExport::generateNullFragmentShader(*module, m_pipelineState, getGlueShaderName()); + FragmentColorExport::generateNullFragmentShader(*module, m_pipelineState, getGlueShaderName()); return module; } diff --git a/lgc/include/lgc/builder/BuilderImpl.h b/lgc/include/lgc/builder/BuilderImpl.h index 5a5694a342..db5fbb5e8d 100644 --- a/lgc/include/lgc/builder/BuilderImpl.h +++ b/lgc/include/lgc/builder/BuilderImpl.h @@ -37,7 +37,7 @@ namespace lgc { // Map vkgc - uint32 -1 is zero extended -static constexpr uint64_t InternalDescriptorSetId = 0x00000000FFFFFFFFull; +static constexpr uint64_t InternalDescriptorSetId = 0x00000000FFFFFFF0ull; // ===================================================================================================================== // Builder implementation class @@ -139,6 +139,10 @@ class BuilderImpl : public BuilderDefs { llvm::Value *CreateFpTruncWithRounding(llvm::Value *value, llvm::Type *destTy, llvm::RoundingMode roundingMode, const llvm::Twine &instName = ""); + // The conversion between float8 and float32. + llvm::Value *CreateFp8Convert(llvm::Value *value, llvm::Type *destTy, bool isBfloat = false, + const llvm::Twine &instName = ""); + // Create quantize operation. 
llvm::Value *CreateQuantizeToFp16(llvm::Value *value, const llvm::Twine &instName = ""); @@ -310,7 +314,7 @@ class BuilderImpl : public BuilderDefs { bool useVertexBufferDescArray(); // Build buffer compact descriptor - llvm::Value *buildBufferCompactDesc(llvm::Value *desc, unsigned stride); + llvm::Value *buildBufferCompactDesc(llvm::Value *desc, llvm::Value *stride); // Build image sampler feedback descriptor llvm::Value *CreateSamplerFeedbackDesc(llvm::Value *feedbackDesc, llvm::Value *resourceDesc, @@ -545,8 +549,6 @@ class BuilderImpl : public BuilderDefs { llvm::Value *elemIdx, unsigned &locationCount, InOutInfo &inOutInfo); std::tuple getInterpModeAndValue(InOutInfo inputInfo, llvm::Value *auxInterpValue); - llvm::Value *evalIjOffsetSmooth(llvm::Value *offset); - llvm::Value *adjustIj(llvm::Value *value, llvm::Value *offset); // Read (part of) a built-in value llvm::Value *readBuiltIn(bool isOutput, BuiltInKind builtIn, InOutInfo inOutInfo, llvm::Value *vertexIndex, diff --git a/lgc/include/lgc/patch/FragmentColorExport.h b/lgc/include/lgc/patch/FragmentColorExport.h index 72cdc9589a..bff7054e34 100644 --- a/lgc/include/lgc/patch/FragmentColorExport.h +++ b/lgc/include/lgc/patch/FragmentColorExport.h @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file FragmentColorExport.h - * @brief LLPC header file: contains declaration of class lgc::FragColorExport. + * @brief LLPC header file: contains declaration of class lgc::FragmentColorExport. *********************************************************************************************************************** */ #pragma once @@ -57,7 +57,7 @@ enum class CompSetting : unsigned { // ===================================================================================================================== // Represents the manager of fragment color export operations. 
-class FragColorExport { +class FragmentColorExport { public: // Color export Info struct Key { @@ -69,7 +69,7 @@ class FragColorExport { ExportFormat dummyExpFmt; // Export format used for dummy "export" instruction. }; - FragColorExport(LgcContext *context); + FragmentColorExport(LgcContext *context); void generateExportInstructions(llvm::ArrayRef info, llvm::ArrayRef values, bool dummyExport, PalMetadata *palMetadata, BuilderBase &builder, @@ -84,9 +84,9 @@ class FragColorExport { static Key computeKey(llvm::ArrayRef info, PipelineState *pipelineState); private: - FragColorExport() = delete; - FragColorExport(const FragColorExport &) = delete; - FragColorExport &operator=(const FragColorExport &) = delete; + FragmentColorExport() = delete; + FragmentColorExport(const FragmentColorExport &) = delete; + FragmentColorExport &operator=(const FragmentColorExport &) = delete; void updateColorExportInfoWithBroadCastInfo(const Key &key, llvm::ArrayRef originExpinfo, bool needMrt0a, llvm::SmallVector &outExpinfo, unsigned *pCbShaderMask); @@ -114,9 +114,9 @@ struct ColorOutputValueInfo { // ===================================================================================================================== // Pass to lower color export calls -class LowerFragColorExport : public llvm::PassInfoMixin { +class LowerFragmentColorExport : public llvm::PassInfoMixin { public: - LowerFragColorExport(); + LowerFragmentColorExport(); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Lower fragment color export calls"; } diff --git a/lgc/include/lgc/patch/PatchInitializeWorkgroupMemory.h b/lgc/include/lgc/patch/InitializeWorkgroupMemory.h similarity index 90% rename from lgc/include/lgc/patch/PatchInitializeWorkgroupMemory.h rename to lgc/include/lgc/patch/InitializeWorkgroupMemory.h index 29b0814b21..4de8a2c625 100644 --- a/lgc/include/lgc/patch/PatchInitializeWorkgroupMemory.h +++ 
b/lgc/include/lgc/patch/InitializeWorkgroupMemory.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchInitializeWorkgroupMemory.h - * @brief LLPC header file: contains declaration of class lgc::PatchInitializeWorkgroupMemory. + * @file InitializeWorkgroupMemory.h + * @brief LLPC header file: contains declaration of class lgc::InitializeWorkgroupMemory. *********************************************************************************************************************** */ #pragma once @@ -39,7 +39,7 @@ namespace lgc { // ===================================================================================================================== // Represents the pass of setting up the value for workgroup global variables. -class PatchInitializeWorkgroupMemory final : public Patch, public llvm::PassInfoMixin { +class InitializeWorkgroupMemory final : public Patch, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/LowerBufferOperations.h b/lgc/include/lgc/patch/LowerBufferOperations.h index 2daa524fce..c6b809501b 100644 --- a/lgc/include/lgc/patch/LowerBufferOperations.h +++ b/lgc/include/lgc/patch/LowerBufferOperations.h @@ -25,18 +25,18 @@ /** *********************************************************************************************************************** * @file LowerBufferOperations.h - * @brief LLPC header file: contains declaration of class lgc::PatchBufferOp. + * @brief LLPC header file: contains declaration of class lgc::LowerBufferOperations. 
*********************************************************************************************************************** */ #pragma once #include "compilerutils/TypeLowering.h" +#include "lgc/builder/BuilderImpl.h" #include "lgc/patch/LgcLowering.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/PassManager.h" #include @@ -76,6 +76,7 @@ class BufferOpLowering { struct DescriptorInfo { optional_bool invariant; optional_bool divergent; + optional_bool globallyCoherent; }; public: @@ -129,11 +130,10 @@ class BufferOpLowering { llvm::Value *const strideIndex, llvm::Type *const type, llvm::Instruction &inst, const llvm::function_ref callback); - llvm::Value *createCompactDesc(llvm::Value *const buffAddress, llvm::Value *const stride); llvm::Value *createLoadDesc(llvm::Value *buffAddress, bool forceRawView, bool isCompact); CompilerUtils::TypeLowering &m_typeLowering; - llvm::IRBuilder<> m_builder; + BuilderImpl m_builder; PipelineState &m_pipelineState; llvm::UniformityInfo &m_uniformityInfo; @@ -155,7 +155,8 @@ class BufferOpLowering { // ===================================================================================================================== // Represents the pass of LLVM patching operations for buffer operations -class PatchBufferOp : public llvm::InstVisitor, public llvm::PassInfoMixin { +class LowerBufferOperations : public llvm::InstVisitor, + public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Function &function, llvm::FunctionAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/LowerCooperativeMatrix.h b/lgc/include/lgc/patch/LowerCooperativeMatrix.h index 6e510a1368..3a18571d69 100644 --- a/lgc/include/lgc/patch/LowerCooperativeMatrix.h +++ b/lgc/include/lgc/patch/LowerCooperativeMatrix.h @@ -108,10 +108,11 @@ class 
LowerCooperativeMatrix : public Patch, public llvm::PassInfoMixinv8*coopMatrix_data as two 16bits elements packed. llvm::Value *convFlatVecToCoopMatrixVec(BuilderCommon &builder, llvm::Value *vecValue, - CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout); + CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout, + unsigned kSize = 16); // Convert cooperativeMatrix vec data to vec data. llvm::Value *convCoopMatrixVecToFlatVec(BuilderCommon &builder, llvm::Value *matrixValue, - CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout); + CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout, + unsigned kSize = 16); // Create cooperative matrix convert operation without reshape operation llvm::Value *cooperativeMatrixConvertInternal(llvm::CastInst::CastOps castOp, llvm::Value *source, diff --git a/lgc/include/lgc/patch/LowerInOut.h b/lgc/include/lgc/patch/LowerInOut.h index 12753252d1..91c65c273d 100644 --- a/lgc/include/lgc/patch/LowerInOut.h +++ b/lgc/include/lgc/patch/LowerInOut.h @@ -41,6 +41,9 @@ namespace lgc { +class EvalIjOffsetSmoothOp; +class AdjustIjOp; + // ===================================================================================================================== // Represents the pass of LLVM patching operations for input import and output export. 
class LowerInOut : public Patch, public llvm::PassInfoMixin { @@ -159,9 +162,8 @@ class LowerInOut : public Patch, public llvm::PassInfoMixin { llvm::Value *readValueFromLds(bool offChip, llvm::Type *readTy, llvm::Value *ldsOffset, BuilderBase &builder); void writeValueToLds(bool offChip, llvm::Value *writeValue, llvm::Value *ldsOffset, BuilderBase &builder); - unsigned calcPatchCountPerThreadGroup(unsigned inVertexCount, unsigned inVertexStride, unsigned outVertexCount, - unsigned outVertexStride, unsigned patchConstCount, - unsigned tessFactorStride) const; + unsigned calcMaxNumPatchesPerGroup(unsigned inputVertexCount, unsigned outputVertexCount, unsigned tessFactorCount, + unsigned ldsSizePerPatch, unsigned ldsBufferSizePerPatch) const; llvm::Value *calcLdsOffsetForVsOutput(llvm::Type *outputTy, unsigned location, unsigned compIdx, BuilderBase &builder); @@ -189,8 +191,12 @@ class LowerInOut : public Patch, public llvm::PassInfoMixin { llvm::Value *getPrimType(BuilderBase &builder); llvm::Value *getLineStipple(BuilderBase &builderBase); - void recordVertexAttribExport(unsigned location, llvm::ArrayRef attribValues); - void exportVertexAttribs(BuilderBase &builder); + void recordVertexAttribute(unsigned exportSlot, llvm::ArrayRef exportValues); + void exportAttributes(BuilderBase &builder); + void exportPosition(unsigned exportSlot, llvm::ArrayRef exportValues, BuilderBase &builder); + + void visitEvalIjOffsetSmoothOp(EvalIjOffsetSmoothOp &op); + void visitAdjustIjOp(AdjustIjOp &op); GfxIpVersion m_gfxIp; // Graphics IP version info PipelineSystemValues m_pipelineSysValues; // Cache of ShaderSystemValues objects, one per shader stage @@ -218,8 +224,9 @@ class LowerInOut : public Patch, public llvm::PassInfoMixin { std::vector m_importCalls; // List of "call" instructions to import inputs std::vector m_exportCalls; // List of "call" instructions to export outputs + std::vector m_gsMsgCalls; // List of "call" instructions to send GS message llvm::SmallDenseMap> - 
m_attribExports; // Export info of vertex attributes: + m_attribExports; // Export info of vertex attributes: PipelineState *m_pipelineState = nullptr; // Pipeline state from PipelineStateWrapper pass std::set m_expLocs; // The locations that already have an export instruction for the vertex shader. diff --git a/lgc/include/lgc/patch/PassthroughHullShader.h b/lgc/include/lgc/patch/PassthroughHullShader.h index d1f2c7b330..de81743eff 100644 --- a/lgc/include/lgc/patch/PassthroughHullShader.h +++ b/lgc/include/lgc/patch/PassthroughHullShader.h @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file PassthroughHullShader.h - * @brief LLPC header file: contains declaration of class lgc::TcsPassthroughShader. + * @brief LLPC header file: contains declaration of class lgc::PassthroughHullShader. *********************************************************************************************************************** */ #pragma once @@ -38,17 +38,17 @@ namespace lgc { // ===================================================================================================================== // Pass to generate tessellation control pass-through shader -class TcsPassthroughShader : public llvm::PassInfoMixin { +class PassthroughHullShader : public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Pass-through hull shader generation"; } void updatePipelineState(llvm::Module &module, PipelineState *pipelineState) const; - llvm::Function *generateTcsPassthroughShader(llvm::Module &module, PipelineShadersResult &pipelineShaders, - PipelineState *pipelineState); + llvm::Function *generatePassthroughHullShader(llvm::Module &module, PipelineShadersResult &pipelineShaders, + PipelineState *pipelineState); llvm::Function *generateTcsPassthroughEntryPoint(llvm::Module &module, 
PipelineState *pipelineState); - void generateTcsPassthroughShaderBody(llvm::Module &module, PipelineShadersResult &pipelineShaders, - PipelineState *pipelineState, llvm::Function *entryPoint); + void generatePassthroughHullShaderBody(llvm::Module &module, PipelineShadersResult &pipelineShaders, + PipelineState *pipelineState, llvm::Function *entryPoint); }; } // namespace lgc diff --git a/lgc/include/lgc/patch/PeepholeOptimization.h b/lgc/include/lgc/patch/PeepholeOptimization.h index a8707bcdc7..32daae8943 100644 --- a/lgc/include/lgc/patch/PeepholeOptimization.h +++ b/lgc/include/lgc/patch/PeepholeOptimization.h @@ -30,8 +30,6 @@ */ #pragma once -#include "lgc/util/Internal.h" -#include "llvm/IR/InstVisitor.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -45,19 +43,11 @@ namespace lgc { // // - Change log2 ( const +/- x ) -> log2 ( max ( 0.0, const +/- x ) ) to avoid application underflow. // -class PeepholeOptimization final : public llvm::InstVisitor, - public llvm::PassInfoMixin { +class PeepholeOptimization final : public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Function &function, llvm::FunctionAnalysisManager &analysisManager); static llvm::StringRef name() { return "Peephole optimizations"; } - - void visitIntToPtr(llvm::IntToPtrInst &intToPtr); - void visitCallInst(llvm::CallInst &callInst); - -private: - bool m_changed; - llvm::SmallVector m_instsToErase; }; } // namespace lgc diff --git a/lgc/include/lgc/patch/PreparePipelineAbi.h b/lgc/include/lgc/patch/PreparePipelineAbi.h index b70825b145..03f8761a8a 100644 --- a/lgc/include/lgc/patch/PreparePipelineAbi.h +++ b/lgc/include/lgc/patch/PreparePipelineAbi.h @@ -61,7 +61,10 @@ class PreparePipelineAbi final : public Patch, public llvm::PassInfoMixin &builder); static void writeTessFactors(PipelineState *pipelineState, llvm::Value *tfBufferDesc, llvm::Value *tfBufferBase, llvm::Value *relPatchId, llvm::Value *outerTf, llvm::Value *innerTf, - llvm::IRBuilder<> 
&builder); + BuilderBase &builder); + static void writeHsOutputs(PipelineState *pipelineState, llvm::Value *offChipLdsDesc, llvm::Value *offChipLdsBase, + llvm::Value *relPatchId, llvm::Value *vertexIdx, llvm::Value *outerTf, + BuilderBase &builder); private: void mergeShader(llvm::Module &module); @@ -70,7 +73,7 @@ class PreparePipelineAbi final : public Patch, public llvm::PassInfoMixin { +class SetUpTargetFeatures : public Patch, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/lgc/include/lgc/patch/ShaderInputs.h b/lgc/include/lgc/patch/ShaderInputs.h index d7b139a53f..1b1161770c 100644 --- a/lgc/include/lgc/patch/ShaderInputs.h +++ b/lgc/include/lgc/patch/ShaderInputs.h @@ -179,7 +179,7 @@ class ShaderInputs { static llvm::CallInst *getSpecialUserData(UserDataMapping kind, BuilderBase &builder); // Get a special user data value as a pointer by inserting a call to lgc.special.user.data then extending it - static llvm::Value *getSpecialUserDataAsPointer(UserDataMapping kind, llvm::Type *pointeeTy, BuilderBase &builder); + static llvm::Value *getSpecialUserDataAsPointer(UserDataMapping kind, BuilderBase &builder); // Get VertexIndex static llvm::Value *getVertexIndex(BuilderBase &builder, const LgcContext &lgcContext); diff --git a/lgc/include/lgc/state/Abi.h b/lgc/include/lgc/state/Abi.h index 7f8aeba066..c090c2070d 100644 --- a/lgc/include/lgc/state/Abi.h +++ b/lgc/include/lgc/state/Abi.h @@ -49,7 +49,7 @@ static const unsigned SiDrvTableGsRingOuT2Offs = 24; static const unsigned SiDrvTableGsRingOuT3Offs = 28; static const unsigned SiDrvTableVsRingInOffs = 32; static const unsigned SiDrvTableTfBufferOffs = 36; -static const unsigned SiDrvTableHsBuffeR0Offs = 40; +static const unsigned SiDrvTableHsBufferOffs = 40; static const unsigned SiDrvTableOffChipParamCache = 44; static const unsigned SiDrvTableSamplepos = 48; static const unsigned 
SiDrvTableTaskPayloadRingOffs = 52; diff --git a/lgc/include/lgc/state/Defs.h b/lgc/include/lgc/state/Defs.h index 073a144f66..8f12742c7e 100644 --- a/lgc/include/lgc/state/Defs.h +++ b/lgc/include/lgc/state/Defs.h @@ -43,7 +43,6 @@ const static char OutputCallPrefix[] = "lgc.output."; const static char OutputImportBuiltIn[] = "lgc.output.import.builtin."; const static char OutputExportGeneric[] = "lgc.output.export.generic."; const static char OutputExportBuiltIn[] = "lgc.output.export.builtin."; -const static char OutputExportXfb[] = "lgc.output.export.xfb."; const static char ReconfigureLocalInvocationId[] = "lgc.reconfigure.local.invocation.id"; const static char SwizzleWorkgroupId[] = "lgc.swizzle.workgroup.id"; @@ -61,11 +60,6 @@ const static char ImmutableConvertingSamplerGlobal[] = "lgc.immutable.converting // Names of entry-points for merged shader const static char EsGsEntryPoint[] = "lgc.shader.ESGS.main"; const static char LsHsEntryPoint[] = "lgc.shader.LSHS.main"; - -const static char NggAttributeThroughMemory[] = "lgc.ngg.attribute.through.memory"; -const static char NggXfbExport[] = "lgc.ngg.xfb.export."; -const static char NggWriteGsOutput[] = "lgc.ngg.write.GS.output."; -const static char NggReadGsOutput[] = "lgc.ngg.read.GS.output."; const static char NggPrimShaderEntryPoint[] = "lgc.shader.PRIM.main"; const static char EntryPointPrefix[] = "lgc.shader."; @@ -94,9 +88,6 @@ static const unsigned MaxTransformFeedbackBuffers = 4; static const unsigned MaxGsStreams = 4; static_assert(MaxGsStreams == MaxTransformFeedbackBuffers, "Unexpected value!"); -// Maximum tess factors per patch -static const unsigned MaxTessFactorsPerPatch = 6; // 4 outer factors + 2 inner factors - static const char RayQueryLdsStackName[] = "LdsStack"; // NOTE: Currently, we restrict the max thread count of ray query to be 64 and make sure the wave size is 64. 
This is // because we don't provide the capability of querying thread ID in group for ray query in vertex processing shaders. diff --git a/lgc/include/lgc/state/IntrinsDefs.h b/lgc/include/lgc/state/IntrinsDefs.h index 522e5920c2..5442b82df1 100644 --- a/lgc/include/lgc/state/IntrinsDefs.h +++ b/lgc/include/lgc/state/IntrinsDefs.h @@ -43,26 +43,12 @@ static const unsigned MaxGeometryOutputVertices = (1 << 11) - 1; static const unsigned MaxComputeWorkgroupSize = (1 << 16) - 1; // Messages that can be generated by using s_sendmsg -static const unsigned HsTessFactor = 2; // HS Tessellation factor is all zero or one -static const unsigned GsDone = 3; // GS wave is done -static const unsigned OrderedPsDone = 7; // Signal end of primitive ordered pixel shading critical section -static const unsigned GsAllocReq = 9; // GS requests that parameter cache space be allocated -static const unsigned GsCut = 0x12; // [3:0] = 2 (GS), [5:4] = 1 (cut) -static const unsigned GsEmit = 0x22; // [3:0] = 2 (GS), [5:4] = 2 (emit) - -static const unsigned GsCutStream0 = 0x12; // [3:0] = 2 (GS), [5:4] = 1 (cut), [9:8] = 0 (stream0) -static const unsigned GsCutStream1 = 0x112; // [3:0] = 2 (GS), [5:4] = 1 (cut), [9:8] = 1 (stream1) -static const unsigned GsCutStream2 = 0x212; // [3:0] = 2 (GS), [5:4] = 1 (cut), [9:8] = 2 (stream2) -static const unsigned GsCutStream3 = 0x312; // [3:0] = 2 (GS), [5:4] = 1 (cut), [9:8] = 3 (stream3) - -static const unsigned GsEmitStream0 = 0x22; // [3:0] = 2 (GS), [5:4] = 2 (emit), [9:8] = 0 (stream0) -static const unsigned GsEmitStream1 = 0x122; // [3:0] = 2 (GS), [5:4] = 2 (emit), [9:8] = 1 (stream1) -static const unsigned GsEmitStream2 = 0x222; // [3:0] = 2 (GS), [5:4] = 2 (emit), [9:8] = 2 (stream2) -static const unsigned GsEmitStream3 = 0x322; // [3:0] = 2 (GS), [5:4] = 2 (emit), [9:8] = 3 (stream3) - -static const unsigned GsEmitCutStreamIdShift = 0x8; // Shift of STREAM_ID of the message GS_EMIT/GS_CUT -static const unsigned GsEmitCutStreamIdMask = 
0x300; // Mask of STREAM_ID of the message GS_EMIT/GS_CUT - +static const unsigned HsTessFactor = 2; // HS Tessellation factor is all zero or one +static const unsigned GsDone = 3; // GS wave is done +static const unsigned OrderedPsDone = 7; // Signal end of primitive ordered pixel shading critical section +static const unsigned GsAllocReq = 9; // GS requests that parameter cache space be allocated +static const unsigned GsCut = 0x12; // [3:0] = 2 (GS), [5:4] = 1 (cut) +static const unsigned GsEmit = 0x22; // [3:0] = 2 (GS), [5:4] = 2 (emit) static const unsigned GetRealTime = 0x83; // [7] = 1, [6:0] = 3 // Count of user SGPRs used in copy shader @@ -505,6 +491,8 @@ union SqBufRsrcWord1 { unsigned u32All; }; +constexpr uint32_t SqBufRsrcTWord1StrideShift = 16; + // Represents the third dword of buffer descriptor SQ_BUF_RSRC_WORD2. union SqBufRsrcWord2 { struct { diff --git a/lgc/include/lgc/state/PipelineState.h b/lgc/include/lgc/state/PipelineState.h index 66c6aec626..8d6d28dfe4 100644 --- a/lgc/include/lgc/state/PipelineState.h +++ b/lgc/include/lgc/state/PipelineState.h @@ -314,11 +314,12 @@ class PipelineState final : public Pipeline { // Gets wave size for the specified shader stage unsigned getShaderWaveSize(ShaderStageEnum stage); - // Gets wave size for the merged shader stage - unsigned getMergedShaderWaveSize(ShaderStageEnum stage); // Gets subgroup size for the specified shader stage unsigned getShaderSubgroupSize(ShaderStageEnum stage); + // Gets shader stage for the specified shader stage that it will merge with + ShaderStageEnum getMergingShaderStage(ShaderStageEnum stage); + // Gets Util::Abi::PipelineType for pipeline unsigned getAbiPipelineType(); // Gets map of ShaderStageEnum to Util::Abi::HardwareStageFlagBits @@ -604,6 +605,11 @@ class PipelineState final : public Pipeline { void recordGraphicsState(llvm::Module *module); void readGraphicsState(llvm::Module *module); + // Wave sizes handling + void recordWaveSize(llvm::Module *module); + void 
readWaveSize(llvm::Module *module); + void determineShaderWaveSize(llvm::Module *module); + // ABI Shader Map void buildAbiHwShaderMap(); @@ -643,8 +649,8 @@ class PipelineState final : public Pipeline { ShaderStageMap> m_resourceUsage; // Per-shader ResourceUsage ShaderStageMap> m_interfaceData; // Per-shader InterfaceData PalMetadata *m_palMetadata = nullptr; // PAL metadata object - ShaderStageMap m_waveSize; // Per-shader wave size - ShaderStageMap m_subgroupSize; // Per-shader subgroup size + unsigned m_waveSize[ShaderStage::Count] = {}; // Per-shader wave size + unsigned m_subgroupSize[ShaderStage::Count] = {}; // Per-shader subgroup size ShaderStageMap m_inputPackState; // The input packable state per shader stage ShaderStageMap m_outputPackState; // The output packable state per shader stage XfbStateMetadata m_xfbStateMetadata = {}; // Transform feedback state metadata diff --git a/lgc/include/lgc/state/ResourceUsage.h b/lgc/include/lgc/state/ResourceUsage.h index 270cd32b02..a62b2374f9 100644 --- a/lgc/include/lgc/state/ResourceUsage.h +++ b/lgc/include/lgc/state/ResourceUsage.h @@ -398,44 +398,45 @@ struct ResourceUsage { struct { struct { - unsigned inVertexStride; // Stride of vertices of input patch (in dword, correspond to - // "lsStride") - unsigned outVertexStride; // Stride of vertices of output patch (in dword, correspond to - // "hsCpStride") - unsigned patchCountPerThreadGroup; // Count of patches per thread group (in dword, correspond to - // "hsNumPatch") - // On-chip calculation factors + // On-chip configurations (in dwords) struct { - unsigned outPatchStart; // Offset into LDS where vertices of output patches start - // (in dword, correspond to "hsOutputBase") - unsigned patchConstStart; // Offset into LDS where patch constants start (in dword, - // correspond to "patchConstBase") - unsigned hsPatchCountStart; // Offset into LDS where count of HS patches start (in dword) - unsigned specialTfValueStart; // Offset into LDS where special TF 
value start (in dword) - unsigned inPatchStart; // Offset into LDS where vertices of input patches start (in dword) + unsigned hsPatchCountStart; // Offset into LDS where HS patch count starts + + unsigned specialTfValueStart; // Offset into LDS where special TF values start + unsigned specialTfValueSize; // Size of special TF value (in dword) + + unsigned tessFactorStart; // Offset into LDS where TFs start + unsigned tessFactorStride; // Size of tess factor stride (in dword) + + unsigned outputPatchStart; // Offset into LDS where vertices of output patches start + unsigned outputVertexStride; // Stride of vertices of output patch (in dwords) + unsigned outputPatchSize; // Size of an output patch output (in dwords) + + unsigned patchConstStart; // Offset into LDS where patch constants start + unsigned patchConstSize; // Size of an output patch constant (in dwords) + + unsigned inputPatchStart; // Offset into LDS where vertices of input patches start + unsigned inputVertexStride; // Stride of vertices of input patch (in dwords) + unsigned inputPatchSize; // Size of an input patch size (in dwords) } onChip; - // Off-chip calculation factors + // Off-chip configurations (in dwords) struct { - unsigned outPatchStart; // Offset into LDS where vertices of output patches start - // (in dword, correspond to "hsOutputBase") - unsigned patchConstStart; // Offset into LDS where patch constants start (in dword, - // correspond to "patchConstBase") - } offChip; + unsigned outputPatchStart; // Offset into LDS where vertices of output patches start + unsigned outputVertexStride; // Stride of vertices of output patch (in dwords) + unsigned outputPatchSize; // Size of an output patch output (in dwords) - unsigned inPatchSize; // size of an input patch size (in dword) + unsigned patchConstStart; // Offset into LDS where patch constants start + unsigned patchConstSize; // Size of an output patch constant (in dwords) + } offChip; - unsigned outPatchSize; // Size of an output patch 
output (in dword, correspond to - // "patchOutputSize") + unsigned maxNumPatchesPerGroup; // Maximum number of patches per thread group - unsigned patchConstSize; // Size of an output patch constants (in dword) - unsigned tessFactorStride; // Size of tess factor stride (in dword) - unsigned specialTfValueSize; // Size of special TF value (in dword) unsigned tessOnChipLdsSize; // On-chip LDS size (exclude off-chip LDS buffer) (in dword) unsigned rayQueryLdsStackSize; // Ray query LDS stack size - bool initialized; // Whether calcFactor has been initialized - } calcFactor; + bool initialized; // Whether hwConfig has been initialized + } hwConfig; } tcs = {}; struct { @@ -460,7 +461,7 @@ struct ResourceUsage { unsigned primAmpFactor; // GS primitive amplification factor bool enableMaxVertOut; // Whether to allow each GS instance to emit maximum vertices (NGG) unsigned rayQueryLdsStackSize; // Ray query LDS stack size - } calcFactor = {}; + } hwConfig = {}; unsigned outLocCount[MaxGsStreams] = {}; } gs; diff --git a/lgc/interface/lgc/Builder.h b/lgc/interface/lgc/Builder.h index 4741cc3f91..54977746be 100644 --- a/lgc/interface/lgc/Builder.h +++ b/lgc/interface/lgc/Builder.h @@ -179,7 +179,8 @@ class BuilderDefs : public BuilderCommon { BufferFlagSampler = 32, // Flag to find Descriptor Sampler BufferFlagAddress = 64, // Flag to return an i64 address of the descriptor BufferFlagAttachedCounter = 128, // Flag to return the counter buffer descriptor attached to the main buffer. - BufferFlagForceRawView = 256 // Flag to convert the buffer descriptor to raw view. + BufferFlagForceRawView = 256, // Flag to convert the buffer descriptor to raw view. + BufferFlagCoherent = 512, // Coherent memory access }; // Get the type of a built-in -- static edition of the method below, so you can use it without a BuilderDefs object. 
@@ -323,6 +324,7 @@ class BuilderDefs : public BuilderCommon { ImageFlagNotAliased = 0x200, // Whether the image is known not to alias any other memory object ImageFlagInvariant = 0x400, // Invariant load ImageFlagSamplePatternOffset = 0x800, // Retrieving sample pattern offset in dwords for specified image + ImageFlagNoAlloc = 0x1000, // }; // Address array indices for image sample and gather methods. Where an optional entry is missing (either @@ -510,6 +512,15 @@ class Builder : public BuilderDefs { llvm::Value *CreateFpTruncWithRounding(llvm::Value *value, llvm::Type *destTy, llvm::RoundingMode roundingMode, const llvm::Twine &instName = ""); + // The conversion between float8 and float32. + // + // @param value : Input value + // @param destTy : Type to convert to + // @param isBfloat : Whether dest type is bfloat8 + // @param instName : Name to give instruction(s) + llvm::Value *CreateFp8Convert(llvm::Value *value, llvm::Type *destTy, bool isBfloat = false, + const llvm::Twine &instName = ""); + // Create quantize operation: truncates float (or vector) value to a value that is representable by a half. // // @param value : Input value (float or float vector) diff --git a/lgc/interface/lgc/BuilderCommon.h b/lgc/interface/lgc/BuilderCommon.h index e5d3efdb32..d850572511 100644 --- a/lgc/interface/lgc/BuilderCommon.h +++ b/lgc/interface/lgc/BuilderCommon.h @@ -117,8 +117,9 @@ class BuilderCommon : public llvm_dialects::Builder { // Convert the element type enum into the corresponding LLVM type. llvm::Type *transCooperativeMatrixElementType(CooperativeMatrixElementType elemType); - // Get the LGC type of a cooperative matrix with the given element type and layout. - llvm::Type *getCooperativeMatrixTy(CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout); + // Get the LGC type of a cooperative matrix with the given element type, layout and K size. 
+ llvm::Type *getCooperativeMatrixTy(CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout, + unsigned kSize = 16); // Whether the type of a cooperative matrix is specified bit width. static bool isTypeNCooperativeMatrix(CooperativeMatrixElementType elemType, unsigned bitWidth); diff --git a/lgc/interface/lgc/BuiltInDefs.h b/lgc/interface/lgc/BuiltInDefs.h index b0c6ef281f..c7fd281ff7 100644 --- a/lgc/interface/lgc/BuiltInDefs.h +++ b/lgc/interface/lgc/BuiltInDefs.h @@ -127,7 +127,6 @@ BUILTIN(InterpLinearCentroid, BuiltInInternalBase + 6, N, P, v2f32) BUILTIN(SamplePosOffset, BuiltInInternalBase + 7, N, P, v2f32) BUILTIN(NumSamples, BuiltInInternalBase + 8, N, P, i32) BUILTIN(SamplePatternIdx, BuiltInInternalBase + 9, N, P, i32) -BUILTIN(GsWaveId, BuiltInInternalBase + 10, N, G, i32) // Internal built-ins for compute input when thread id is swizzled BUILTIN(UnswizzledLocalInvocationId, BuiltInInternalBase + 11, N, C, i32) diff --git a/lgc/include/lgc/util/Debug.h b/lgc/interface/lgc/Debug.h similarity index 64% rename from lgc/include/lgc/util/Debug.h rename to lgc/interface/lgc/Debug.h index 021d2d099c..16291ec3b1 100644 --- a/lgc/include/lgc/util/Debug.h +++ b/lgc/interface/lgc/Debug.h @@ -30,6 +30,8 @@ */ #pragma once +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" namespace lgc { @@ -37,6 +39,40 @@ namespace lgc { // Get pointer to stream for LLPC_OUTS, or nullptr if disabled. 
llvm::raw_ostream *getLgcOuts(); +// Create Indexed instruction slots for all the instruction of the function, the instruction in the function +// can later be referenced by index +// Usage: +// 1) use amdllpc/llpc -print-after-all to get the module pass dump, instructions names/indices +// 2) At the end of the pass processing place, you can insert instructions + +// for (Function &decl : module) { +// if (decl.getName().ends_with("RayGen@@YAXXZ")) { +// lgc::Builder builder(module.getContext()); +// InstructionSlot instSlot; +// instSlot.createFuncSlot(&decl); +// auto bufferDesc = instSlot.getValueByIdx(46); +// auto nextPos = (cast(bufferDesc))->getNextNode(); +// builder.SetInsertPoint(nextPos); +// SmallVector lists = {bufferDesc}; +// builder.create("desc:%d\n", lists); +// } +// } + +class InstructionSlot { + unsigned m_valueIndex = 0; + using IndexMap = llvm::DenseMap; + using NamedMap = llvm::DenseMap; + IndexMap m_iMap; + NamedMap m_nMap; + + void createSlot(llvm::Value *val); + +public: + InstructionSlot(llvm::Function *func) { createFuncSlot(func); } + void createFuncSlot(llvm::Function *func); + llvm::Value *getValueByIdx(unsigned idx); + llvm::Value *getValueByName(llvm::StringRef name); +}; } // namespace lgc // Output general message diff --git a/lgc/interface/lgc/LgcContext.h b/lgc/interface/lgc/LgcContext.h index f8c21fa4d1..270bea727f 100644 --- a/lgc/interface/lgc/LgcContext.h +++ b/lgc/interface/lgc/LgcContext.h @@ -31,6 +31,7 @@ #pragma once #include "llvm/ADT/StringRef.h" +#include "llvm/IR/PassManager.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/raw_ostream.h" @@ -140,7 +141,7 @@ class LgcContext { // Utility method to create a start/stop timer pass and add it to the given // pass manager - static void createAndAddStartStopTimer(lgc::PassManager &passMgr, llvm::Timer *timer, bool starting); + static void createAndAddStartStopTimer(llvm::ModulePassManager &passMgr, llvm::Timer *timer, bool starting); // Set and get a pointer to 
the stream used for LLPC_OUTS. This is initially nullptr, // signifying no output from LLPC_OUTS. Setting this to a stream means that LLPC_OUTS diff --git a/lgc/interface/lgc/LgcDialect.td b/lgc/interface/lgc/LgcDialect.td index 430b975c50..97ca648e50 100644 --- a/lgc/interface/lgc/LgcDialect.td +++ b/lgc/interface/lgc/LgcDialect.td @@ -56,7 +56,7 @@ class LgcOp traits_ = []> >; def BufferAddrToPtrOp : LgcOp<"buffer.addr.to.ptr", [Memory<[]>, WillReturn]> { - let arguments = (ins I64:$address); + let arguments = (ins I64:$address, AttrI1:$globallyCoherent); let results = (outs BufferPointer:$result); let summary = "convert a buffer address into a buffer fat pointer"; @@ -69,7 +69,7 @@ def BufferAddrToPtrOp : LgcOp<"buffer.addr.to.ptr", [Memory<[]>, WillReturn]> { } def BufferDescToPtrOp : LgcOp<"buffer.desc.to.ptr", [Memory<[]>, WillReturn]> { - let arguments = (ins V4I32:$desc); + let arguments = (ins V4I32:$desc, AttrI1:$globallyCoherent); let results = (outs BufferPointer:$result); let summary = "convert a buffer descriptor into a buffer pointer"; @@ -94,7 +94,7 @@ def ConvertToStridedBufferPointerOp : LgcOp<"convert.to.strided.buffer.pointer", } def StridedBufferDescToPtrOp : LgcOp<"strided.buffer.desc.to.ptr", [Memory<[]>, WillReturn]> { - let arguments = (ins V4I32:$desc); + let arguments = (ins V4I32:$desc, AttrI1:$globallyCoherent); let results = (outs BufferStridedPointer:$result); let summary = "convert a buffer descriptor into a strided buffer pointer"; @@ -104,7 +104,7 @@ def StridedBufferDescToPtrOp : LgcOp<"strided.buffer.desc.to.ptr", [Memory<[]>, } def StridedBufferAddrAndStrideToPtrOp : LgcOp<"strided.buffer.addr.and.stride.to.ptr", [Memory<[]>, WillReturn]> { - let arguments = (ins I64:$address, I32:$stride); + let arguments = (ins I64:$address, I32:$stride, AttrI1:$globallyCoherent); let results = (outs BufferStridedPointer:$result); let summary = "convert a buffer address and a stride into a strided buffer pointer"; @@ -198,7 +198,7 @@ def 
LoadStridedBufferDescOp : LgcOp<"load.strided.buffer.desc", [Memory<[]>, Wil } def BufferLoadDescToPtrOp : LgcOp<"buffer.load.desc.to.ptr", [Memory<[]>, WillReturn]> { - let arguments = (ins ConstantPointer:$descPtr, AttrI1:$forceRawView, AttrI1:$isCompact); + let arguments = (ins ConstantPointer:$descPtr, AttrI1:$forceRawView, AttrI1:$isCompact, AttrI1:$globallyCoherent); let results = (outs BufferPointer:$result); let summary = "convert a constant buffer pointer to a fat buffer pointer and implicitly do the load"; @@ -208,7 +208,7 @@ def BufferLoadDescToPtrOp : LgcOp<"buffer.load.desc.to.ptr", [Memory<[]>, WillRe } def StridedBufferLoadDescToPtrOp : LgcOp<"stride.buffer.load.desc.to.ptr", [Memory<[]>, WillReturn]> { - let arguments = (ins ConstantPointer:$descPtr, AttrI1:$forceRawView, AttrI1:$isCompact, I32:$stride); + let arguments = (ins ConstantPointer:$descPtr, AttrI1:$forceRawView, AttrI1:$isCompact, AttrI1:$globallyCoherent, I32:$stride); let results = (outs BufferStridedPointer:$result); let summary = "convert a constant buffer pointer to a fat strided buffer pointer and implicitly do the load"; @@ -307,7 +307,7 @@ def GetMeshBuiltinInputOp : DivergentLgcOp<"get.mesh.builtin.input", [Memory<[]> let description = [{ Return the value of mesh built-in input. - `builtIn` is the input built-in ID of mesh shader. + `builtin` is the input built-in ID of mesh shader. }]; } @@ -315,7 +315,7 @@ def WriteMeshOutputOp : LgcOp<"write.mesh.output", [Memory<[]>]> { let arguments = (ins AttrI1:$is_primitive, AttrI32:$location, I32:$location_offset, I32:$component_index, I32:$prim_or_vertex_index, value:$output_value); let results = (outs); - let summary = "Write mesh shader primitive/vertex outputs"; + let summary = "write mesh shader primitive/vertex outputs"; let description = [{ In the mesh shader, write mesh shader primitive/vertex outputs to LDS. 
@@ -328,6 +328,108 @@ def WriteMeshOutputOp : LgcOp<"write.mesh.output", [Memory<[]>]> { }]; } +def NggExportPositionOp : LgcOp<"ngg.export.position", [Memory<[(write)]>]> { + let arguments = (ins AttrI32:$export_slot, F32:$export_value0, F32:$export_value1, F32:$export_value2, F32:$export_value3); + let results = (outs); + + let summary = "export position in NGG primitive shader"; + let description = [{ + In NGG primitive shader, export position + + `export_slot` is the export slot. + `export_value0` is the first position value to export. + `export_value1` is the second position value to export. + `export_value2` is the third position value to export. + `export_value3` is the fourth position value to export. + }]; +} + +def NggExportAttributeOp : LgcOp<"ngg.export.attribute", [Memory<[(write)]>]> { + let arguments = (ins AttrI32:$export_slot, F32:$export_value0, F32:$export_value1, F32:$export_value2, F32:$export_value3); + let results = (outs); + + let summary = "export attribute in NGG primitive shader"; + let description = [{ + In NGG primitive shader, export attribute + + `export_slot` is the export slot. + `export_value0` is the first attribute value to export. + `export_value1` is the second attribute value to export. + `export_value2` is the third attribute value to export. + `export_value3` is the fourth attribute value to export. + }]; +} + +def WriteXfbOutputOp : LgcOp<"write.xfb.output", [Memory<[(write)]>]> { + let arguments = (ins AttrI32:$xfb_buffer, AttrI32:$xfb_offset, AttrI32:$stream_id, value:$output_value); + let results = (outs); + + let summary = "write XFB output to XFB buffer"; + let description = [{ + Write XFB (transform feedback) output to XFB buffer with the specified XFB offset + + `xfb_buffer` is the XFB buffer. + `xfb_offset` is the XFB offset in this buffer. + `stream_id` is ID of the vertex stream to write XFB output. + `output_value` is the XFB output value to write. 
+ }]; +} + +def NggReadGsOutputOp : DivergentLgcOp<"ngg.read.gs.output", [Memory<[(read)]>, WillReturn]> { + let arguments = (ins AttrI32:$location, AttrI32:$component, AttrI32:$stream_id); + let results = (outs value:$result); + + let defaultBuilderHasExplicitResultType = true; + + let summary = "read the value of the specified GS output from LDS"; + let description = [{ + Read the value of the specified GS output from LDS in NGG primitive shader + + `location` is location of the output. + `component` is component of the output (used for vector element addressing). + `stream_id` is ID of the vertex stream to read GS output. + }]; +} + +def NggWriteGsOutputOp : LgcOp<"ngg.write.gs.output", [Memory<[(write)]>]> { + let arguments = (ins AttrI32:$location, AttrI32:$component, AttrI32:$stream_id, value:$output_value); + let results = (outs); + + let summary = "write the value of the specified GS output to LDS"; + let description = [{ + Write the value of the specified GS output to LDS in NGG primitive shader + + `location` is location of the output. + `component` is component of the output (used for vector element addressing). + `stream_id` is ID of the vertex stream to write GS output. + `output_value` is the output value to write. + }]; +} + +def GsEmitStreamOp : DivergentLgcOp<"gs.emit.stream", [Memory<[(write)]>]> { +let arguments = (ins AttrI32:$stream_id); + let results = (outs); + + let summary = "emit a vertex to the current output primitive at the specified stream"; + let description = [{ + Emit a vertex to the current output primitive at the specified stream + + `stream_id` is ID of the vertex stream to emit a vertex to the current output primitive. 
+ }]; +} + +def GsCutStreamOp : DivergentLgcOp<"gs.cut.stream", [Memory<[(write)]>]> { +let arguments = (ins AttrI32:$stream_id); + let results = (outs); + + let summary = "complete the current output primitive at the specified stream"; + let description = [{ + Complete the current output primitive at the specified stream + + `stream_id` is ID of the vertex stream to complete the current output primitive. + }]; +} + def GenericLocationOp : OpClass { let arguments = (ins AttrI1:$per_primitive, AttrI32:$location, I32:$loc_offset, I32:$elem_idx, I32:$array_index); @@ -408,6 +510,33 @@ def OutputImportGenericOp : DivergentLgcOp<"output.import.generic", [Memory<[(re }]; } +def EvalIjOffsetSmoothOp : DivergentLgcOp<"eval.Ij.offset.smooth", [Memory<[]>, WillReturn]> { + let arguments = (ins value:$value); + let results = (outs value:$result); + + let defaultBuilderHasExplicitResultType = true; + + let summary = "evaluate I,J for interpolation: center offset, smooth (perspective) version"; + let description = [{ + Evaluate I,J for interpolation: center offset, smooth (perspective) version. + + Only used in Fragment. + }]; +} + +def AdjustIjOp : DivergentLgcOp<"adjust.Ij", [Memory<[]>, WillReturn]> { + let arguments = (ins value:$value, value:$offset); + let results = (outs value:$result); + + let defaultBuilderHasExplicitResultType = true; + + let summary = "Adjust I,J values by offset."; + let description = [{ + This adjusts value by its X and Y derivatives times the X and Y components of offset. + If value is a vector, this is done component-wise. 
+ }]; +} + def InputImportInterpolatedOp : DivergentLgcOp<"input.import.interpolated", [Memory<[]>, WillReturn]> { let superclass = GenericLocationOp; @@ -747,7 +876,7 @@ def InvariantDecorationOp : LgcOp<"invariant.decoration", [WillReturn]> { } def CooperativeMatrixLengthOp : LgcOp<"cooperative.matrix.length", [Memory<[]>, WillReturn]> { - let arguments = (ins CooperativeMatrixLayout:$layout); + let arguments = (ins CooperativeMatrixLayout:$layout, AttrI32:$k_size); let results = (outs I32:$result); let summary = "get the length for the cooperative matrix"; @@ -755,12 +884,13 @@ def CooperativeMatrixLengthOp : LgcOp<"cooperative.matrix.length", [Memory<[]>, Get the "length" of a matrix of the given layout, i.e. the number of matrix components stored per lane. 'layout' is layout of cooperative matrix. + 'k_size' is the matrix K size. }]; } def CooperativeMatrixLoadOp : DivergentLgcOp<"cooperative.matrix.load", [Memory<[(read)]>, Convergent, WillReturn]> { let arguments = (ins value:$pointer, value:$stride, AttrI1:$col_major, CooperativeMatrixElementType:$elem_type, - CooperativeMatrixLayout:$layout, AttrI32:$memory_access, AttrI32:$alignment); + CooperativeMatrixLayout:$layout, AttrI32:$memory_access, AttrI32:$alignment, AttrI32:$k_size); let results = (outs value:$result); let defaultBuilderHasExplicitResultType = true; @@ -783,13 +913,14 @@ def CooperativeMatrixLoadOp : DivergentLgcOp<"cooperative.matrix.load", [Memory< - Bit 2 is set if the memory is temporal. 'alignment' is the alignment of this load operation. + 'k_size' is the matrix K size. 
}]; } def CooperativeMatrixStoreOp : LgcOp<"cooperative.matrix.store", [Memory<[(write)]>, Convergent]> { let arguments = (ins value:$pointer, value:$stride, AttrI1:$col_major, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout, AttrI32:$memory_access, AttrI32:$alignment, - value:$store_value); + value:$store_value, AttrI32:$k_size); let results = (outs); let summary = "Store cooperative matrix elements per lane to the memory"; @@ -810,11 +941,12 @@ def CooperativeMatrixStoreOp : LgcOp<"cooperative.matrix.store", [Memory<[(write 'alignment' is the alignment of this store operation. 'store_value' is the elements of the cooperative matrix perlane typed in or to be stored in memory. + 'k_size' is the matrix K size. }]; } def CooperativeMatrixFillOp : DivergentLgcOp<"cooperative.matrix.fill", [Memory<[]>, WillReturn]> { - let arguments = (ins value:$scalar, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout); + let arguments = (ins value:$scalar, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout, AttrI32:$k_size); let results = (outs value:$result); let defaultBuilderHasExplicitResultType = true; @@ -826,6 +958,7 @@ def CooperativeMatrixFillOp : DivergentLgcOp<"cooperative.matrix.fill", [Memory< 'scalar' is the value to fill the cooperative matrix. 'elem_type' is the element type for the cooperative matrix. 'layout' is the layout of the input cooperative matrix. + 'k_size' is the matrix K size. 
}]; } @@ -942,7 +1075,7 @@ def CooperativeMatrixTimesScalarOp : DivergentLgcOp<"cooperative.matrix.times.sc def CooperativeMatrixMulAddOp : DivergentLgcOp<"cooperative.matrix.muladd", [Convergent, WillReturn]> { let arguments = (ins value:$matrix_a, value:$matrix_b, value:$matrix_c, AttrI1:$is_signed_a, AttrI1:$is_signed_b, AttrI1:$is_sat_or_opsel, AttrI1:$is_tied, CooperativeMatrixElementType:$matrix_a_elem_type, - CooperativeMatrixElementType:$matrix_b_elem_type, CooperativeMatrixElementType:$matrix_c_elem_type); + CooperativeMatrixElementType:$matrix_b_elem_type, CooperativeMatrixElementType:$matrix_c_elem_type, CooperativeMatrixElementType:$matrix_d_elem_type, AttrI32:$k_multiplier); let results = (outs value:$result); let defaultBuilderHasExplicitResultType = true; @@ -968,6 +1101,8 @@ def CooperativeMatrixMulAddOp : DivergentLgcOp<"cooperative.matrix.muladd", [Con '$matrix_a_elem_type' is the component type of the matrix A '$matrix_b_elem_type' is the component type of the matrix B '$matrix_c_elem_type' is the component type of the matrix C + '$matrix_d_elem_type' is the component type of the matrix D + '$k_multiplier' is the multiplier for the matrix K size. 
}]; } @@ -1029,7 +1164,7 @@ def SparsityIndexLoadOp : DivergentLgcOp<"sparsityindex.load", [Memory<[(read)]> def SparseCooperativeMatrixMulAddOp : DivergentLgcOp<"sparseCooperativeMatrix.muladd", [Convergent, WillReturn]> { let arguments = (ins value:$matrix_a, value:$sparse_index, value:$matrix_b, value:$matrix_c, AttrI1:$is_signed_a, AttrI1:$is_signed_b, AttrI1:$is_sat, CooperativeMatrixElementType:$matrix_a_elem_type, - CooperativeMatrixElementType:$matrix_b_elem_type, CooperativeMatrixElementType:$matrix_c_elem_type); + CooperativeMatrixElementType:$matrix_b_elem_type, CooperativeMatrixElementType:$matrix_c_elem_type, CooperativeMatrixElementType:$matrix_d_elem_type, AttrI32:$k_multiplier); let results = (outs value:$result); let defaultBuilderHasExplicitResultType = true; @@ -1052,6 +1187,8 @@ def SparseCooperativeMatrixMulAddOp : DivergentLgcOp<"sparseCooperativeMatrix.mu '$matrix_a_elem_type' is the component type of the A matrix. '$matrix_b_elem_type' is the component type of the B matrix. '$matrix_c_elem_type' is the component type of the C matrix. + '$matrix_d_elem_type' is the component type of the D matrix. + '$k_multiplier' is the multiplier for the matrix K size. 
}]; } diff --git a/lgc/interface/lgc/PassManager.h b/lgc/interface/lgc/PassManager.h index 8741946ef0..67afe0d530 100644 --- a/lgc/interface/lgc/PassManager.h +++ b/lgc/interface/lgc/PassManager.h @@ -104,6 +104,7 @@ class MbPassManager : public llvm::PassManager { // Register a pass to identify it with a short name in the pass manager virtual void registerPass(llvm::StringRef passName, llvm::StringRef className) = 0; virtual void run(llvm::ModuleBunch &moduleBunch) = 0; + virtual void setPassIndex(unsigned *passIndex) = 0; virtual bool stopped() const = 0; virtual llvm::PassInstrumentationCallbacks &getInstrumentationCallbacks() = 0; diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h index c30b5f2f57..fdde94fa1d 100644 --- a/lgc/interface/lgc/Pipeline.h +++ b/lgc/interface/lgc/Pipeline.h @@ -129,7 +129,7 @@ static const char SampleShadingMetaName[] = "lgc.sample.shading"; // The front-end should zero-initialize a struct with "= {}" in case future changes add new fields. // Note: new fields must be added to the end of this structure to maintain test compatibility. union Options { - unsigned u32All[46]; + unsigned u32All[48]; struct { uint64_t hash[2]; // Pipeline hash to set in ELF PAL metadata unsigned includeDisassembly; // If set, the disassembly for all compiled shaders will be included @@ -205,6 +205,9 @@ union Options { // eliminating it if the write value is 1.0. bool enableMapClipDistMask; // For OGL only, whether to remap the clip distances. unsigned clipPlaneMask; // For OGL only, defines the bitmask for enabling/disabling clip planes. + unsigned reserved24; + bool checkRawBufferAccessDescStride; // Check descriptor stride to workaround an issue that a strided buffer desc is + // used for a raw buffer access instruction. 
}; }; static_assert(sizeof(Options) == sizeof(Options::u32All)); @@ -327,6 +330,9 @@ union ShaderOptions { /// Specifies that any shader input variables decorated as ViewIndex /// will be assigned values as if they were decorated as DeviceIndex. bool viewIndexFromDeviceIndex; + + /// Force underflow prevention for log and pow + bool forceUnderflowPrevention; }; }; static_assert(sizeof(ShaderOptions) == sizeof(ShaderOptions::u32All)); diff --git a/lgc/patch/CollectResourceUsage.cpp b/lgc/patch/CollectResourceUsage.cpp index 3b3ba150ce..2806ae7e11 100644 --- a/lgc/patch/CollectResourceUsage.cpp +++ b/lgc/patch/CollectResourceUsage.cpp @@ -32,6 +32,7 @@ #include "MeshTaskShader.h" #include "NggPrimShader.h" #include "lgc/Builder.h" +#include "lgc/Debug.h" #include "lgc/LgcDialect.h" #include "lgc/state/IntrinsDefs.h" #include "lgc/state/PalMetadata.h" @@ -39,7 +40,6 @@ #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" #include "lgc/util/BuilderBase.h" -#include "lgc/util/Debug.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvm/ADT/SmallSet.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -477,7 +477,7 @@ bool CollectResourceUsage::checkGsOnChipValidity() { break; } - gsResUsage->inOutUsage.gs.calcFactor.inputVertices = inVertsPerPrim; + gsResUsage->inOutUsage.gs.hwConfig.inputVertices = inVertsPerPrim; } else if (hasTs) { inVertsPerPrim = m_pipelineState->getNumPatchControlPoints(); } else { @@ -525,15 +525,15 @@ bool CollectResourceUsage::checkGsOnChipValidity() { assert(ldsSizeDwords <= maxHwGsLdsSizeDwords); (void(maxHwGsLdsSizeDwords)); // Unused - gsResUsage->inOutUsage.gs.calcFactor.esVertsPerSubgroup = 1; - gsResUsage->inOutUsage.gs.calcFactor.gsPrimsPerSubgroup = 1; + gsResUsage->inOutUsage.gs.hwConfig.esVertsPerSubgroup = 1; + gsResUsage->inOutUsage.gs.hwConfig.gsPrimsPerSubgroup = 1; - gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize = ldsSizeDwords; + gsResUsage->inOutUsage.gs.hwConfig.gsOnChipLdsSize = ldsSizeDwords; - 
gsResUsage->inOutUsage.gs.calcFactor.esGsRingItemSize = 0; - gsResUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize = 0; + gsResUsage->inOutUsage.gs.hwConfig.esGsRingItemSize = 0; + gsResUsage->inOutUsage.gs.hwConfig.gsVsRingItemSize = 0; - gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor = primAmpFactor; + gsResUsage->inOutUsage.gs.hwConfig.primAmpFactor = primAmpFactor; gsOnChip = true; // For mesh shader, GS is always on-chip } else if (nggControl->enableNgg) { @@ -737,23 +737,23 @@ bool CollectResourceUsage::checkGsOnChipValidity() { // EN_MAX_VERT_OUT_PER_GS_INSTANCE. assert(!hasTs || enableMaxVertOut || gsPrimsPerSubgroup >= 3); - gsResUsage->inOutUsage.gs.calcFactor.esVertsPerSubgroup = esVertsPerSubgroup; - gsResUsage->inOutUsage.gs.calcFactor.gsPrimsPerSubgroup = gsPrimsPerSubgroup; + gsResUsage->inOutUsage.gs.hwConfig.esVertsPerSubgroup = esVertsPerSubgroup; + gsResUsage->inOutUsage.gs.hwConfig.gsPrimsPerSubgroup = gsPrimsPerSubgroup; // EsGsLdsSize is unnecessary when there is no API GS. - gsResUsage->inOutUsage.gs.calcFactor.esGsLdsSize = hasGs ? expectedEsLdsSize : 0; - gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize = needsLds ? ldsSizeDwords : 0; + gsResUsage->inOutUsage.gs.hwConfig.esGsLdsSize = hasGs ? expectedEsLdsSize : 0; + gsResUsage->inOutUsage.gs.hwConfig.gsOnChipLdsSize = needsLds ? 
ldsSizeDwords : 0; - gsResUsage->inOutUsage.gs.calcFactor.esGsRingItemSize = esGsRingItemSize; - gsResUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize = gsVsRingItemSize; + gsResUsage->inOutUsage.gs.hwConfig.esGsRingItemSize = esGsRingItemSize; + gsResUsage->inOutUsage.gs.hwConfig.gsVsRingItemSize = gsVsRingItemSize; for (int i = 0; i < MaxGsStreams; ++i) { - gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] = gsVsVertexItemSize[i]; + gsResUsage->inOutUsage.gs.hwConfig.gsVsVertexItemSize[i] = gsVsVertexItemSize[i]; } - gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor = primAmpFactor; - gsResUsage->inOutUsage.gs.calcFactor.enableMaxVertOut = enableMaxVertOut; - gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize = rayQueryLdsStackSize; + gsResUsage->inOutUsage.gs.hwConfig.primAmpFactor = primAmpFactor; + gsResUsage->inOutUsage.gs.hwConfig.enableMaxVertOut = enableMaxVertOut; + gsResUsage->inOutUsage.gs.hwConfig.rayQueryLdsStackSize = rayQueryLdsStackSize; gsOnChip = true; // In NGG mode, GS is always on-chip since copy shader is not present. 
} else { @@ -917,17 +917,17 @@ bool CollectResourceUsage::checkGsOnChipValidity() { if (gsResUsage->useRayQueryLdsStack) gsPrimsPerSubgroup = std::min(gsPrimsPerSubgroup, MaxRayQueryThreadsPerGroup); - gsResUsage->inOutUsage.gs.calcFactor.esVertsPerSubgroup = esVertsPerSubgroup; - gsResUsage->inOutUsage.gs.calcFactor.gsPrimsPerSubgroup = gsPrimsPerSubgroup; - gsResUsage->inOutUsage.gs.calcFactor.esGsLdsSize = esGsLdsSize; - gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize = gsOnChipLdsSize; - gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize = rayQueryLdsStackSize; + gsResUsage->inOutUsage.gs.hwConfig.esVertsPerSubgroup = esVertsPerSubgroup; + gsResUsage->inOutUsage.gs.hwConfig.gsPrimsPerSubgroup = gsPrimsPerSubgroup; + gsResUsage->inOutUsage.gs.hwConfig.esGsLdsSize = esGsLdsSize; + gsResUsage->inOutUsage.gs.hwConfig.gsOnChipLdsSize = gsOnChipLdsSize; + gsResUsage->inOutUsage.gs.hwConfig.rayQueryLdsStackSize = rayQueryLdsStackSize; - gsResUsage->inOutUsage.gs.calcFactor.esGsRingItemSize = esGsRingItemSize; - gsResUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize = gsOnChip ? gsVsRingItemSizeOnChip : gsVsRingItemSize; + gsResUsage->inOutUsage.gs.hwConfig.esGsRingItemSize = esGsRingItemSize; + gsResUsage->inOutUsage.gs.hwConfig.gsVsRingItemSize = gsOnChip ? 
gsVsRingItemSizeOnChip : gsVsRingItemSize; for (int i = 0; i < MaxGsStreams; ++i) { - gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] = gsVsVertexItemSize[i]; + gsResUsage->inOutUsage.gs.hwConfig.gsVsVertexItemSize[i] = gsVsVertexItemSize[i]; } if (m_pipelineState->getTargetInfo().getGfxIpVersion().major == 10 && hasTs && !gsOnChip) { @@ -940,10 +940,10 @@ bool CollectResourceUsage::checkGsOnChipValidity() { if (onChipGsLdsMagicSize > maxLdsSize) { // Decrease the verts esVertsNum = (maxLdsSize - esGsExtraLdsDwords) / esGsRingItemSize; - gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize = maxLdsSize; + gsResUsage->inOutUsage.gs.hwConfig.gsOnChipLdsSize = maxLdsSize; } else { // Increase the size - gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize = onChipGsLdsMagicSize; + gsResUsage->inOutUsage.gs.hwConfig.gsOnChipLdsSize = onChipGsLdsMagicSize; } } // Support multiple GS instances @@ -957,8 +957,8 @@ bool CollectResourceUsage::checkGsOnChipValidity() { if (gsResUsage->useRayQueryLdsStack) gsPrimsNum = std::min(gsPrimsNum, MaxRayQueryThreadsPerGroup); - gsResUsage->inOutUsage.gs.calcFactor.esVertsPerSubgroup = esVertsNum; - gsResUsage->inOutUsage.gs.calcFactor.gsPrimsPerSubgroup = gsPrimsNum; + gsResUsage->inOutUsage.gs.hwConfig.esVertsPerSubgroup = esVertsNum; + gsResUsage->inOutUsage.gs.hwConfig.gsPrimsPerSubgroup = gsPrimsNum; } } @@ -974,31 +974,31 @@ bool CollectResourceUsage::checkGsOnChipValidity() { } LLPC_OUTS("\n"); - LLPC_OUTS("EsVerts = " << gsResUsage->inOutUsage.gs.calcFactor.esVertsPerSubgroup << " verts/subgroup\n"); - LLPC_OUTS("GsPrims = " << gsResUsage->inOutUsage.gs.calcFactor.gsPrimsPerSubgroup << " prims/subgroup\n"); + LLPC_OUTS("EsVerts = " << gsResUsage->inOutUsage.gs.hwConfig.esVertsPerSubgroup << " verts/subgroup\n"); + LLPC_OUTS("GsPrims = " << gsResUsage->inOutUsage.gs.hwConfig.gsPrimsPerSubgroup << " prims/subgroup\n"); LLPC_OUTS("\n"); - LLPC_OUTS("EsGsLdsSize = " << gsResUsage->inOutUsage.gs.calcFactor.esGsLdsSize << 
" dwords\n"); - LLPC_OUTS("GsOnchipLdsSize = " << gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize << " dwords\n"); - if (gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize > 0) { - LLPC_OUTS("RayQueryLdsStack = " << gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize << " dwords (Start = " - << gsResUsage->inOutUsage.gs.calcFactor.gsOnChipLdsSize << ")\n"); + LLPC_OUTS("EsGsLdsSize = " << gsResUsage->inOutUsage.gs.hwConfig.esGsLdsSize << " dwords\n"); + LLPC_OUTS("GsOnchipLdsSize = " << gsResUsage->inOutUsage.gs.hwConfig.gsOnChipLdsSize << " dwords\n"); + if (gsResUsage->inOutUsage.gs.hwConfig.rayQueryLdsStackSize > 0) { + LLPC_OUTS("RayQueryLdsStack = " << gsResUsage->inOutUsage.gs.hwConfig.rayQueryLdsStackSize << " dwords (Start = " + << gsResUsage->inOutUsage.gs.hwConfig.gsOnChipLdsSize << ")\n"); } LLPC_OUTS("\n"); - LLPC_OUTS("EsGsRingItemSize = " << gsResUsage->inOutUsage.gs.calcFactor.esGsRingItemSize << " dwords\n"); - LLPC_OUTS("GsVsRingItemSize = " << gsResUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize << " dwords\n"); + LLPC_OUTS("EsGsRingItemSize = " << gsResUsage->inOutUsage.gs.hwConfig.esGsRingItemSize << " dwords\n"); + LLPC_OUTS("GsVsRingItemSize = " << gsResUsage->inOutUsage.gs.hwConfig.gsVsRingItemSize << " dwords\n"); LLPC_OUTS("GsVsVertexItemSizes = ["); for (unsigned i = 0; i < MaxGsStreams; ++i) { - LLPC_OUTS(gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i]); + LLPC_OUTS(gsResUsage->inOutUsage.gs.hwConfig.gsVsVertexItemSize[i]); LLPC_OUTS((i == MaxGsStreams - 1 ? "" : ", ")); } LLPC_OUTS("] dwords\n"); LLPC_OUTS("\n"); if (meshPipeline || m_pipelineState->getNggControl()->enableNgg) { - LLPC_OUTS("PrimAmpFactor = " << gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor << "\n"); - LLPC_OUTS("EnableMaxVertOut = " << (gsResUsage->inOutUsage.gs.calcFactor.enableMaxVertOut ? 
"true" : "false") + LLPC_OUTS("PrimAmpFactor = " << gsResUsage->inOutUsage.gs.hwConfig.primAmpFactor << "\n"); + LLPC_OUTS("EnableMaxVertOut = " << (gsResUsage->inOutUsage.gs.hwConfig.enableMaxVertOut ? "true" : "false") << "\n"); LLPC_OUTS("\n"); } @@ -1055,8 +1055,7 @@ bool CollectResourceUsage::checkGsOnChipValidity() { const auto &streamXfbBuffers = m_pipelineState->getStreamXfbBuffers(); for (unsigned i = 0; i < MaxGsStreams; ++i) { - unsigned streamItemSize = - gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] * geometryMode.outputVertices; + unsigned streamItemSize = gsResUsage->inOutUsage.gs.hwConfig.gsVsVertexItemSize[i] * geometryMode.outputVertices; LLPC_OUTS("Stream[" << i << "] = " << streamItemSize << " dwords"); if (streamItemSize == 0) LLPC_OUTS(" (Inactive)"); @@ -1304,8 +1303,8 @@ void CollectResourceUsage::visitCallInst(CallInst &callInst) { } } } - } else if (mangledName.starts_with(lgcName::OutputExportXfb)) { - auto outputValue = callInst.getArgOperand(callInst.arg_size() - 1); + } else if (isa(callInst)) { + auto outputValue = cast(callInst).getOutputValue(); if (isa(outputValue) || isa(outputValue)) { // NOTE: If an output value is unspecified, we can safely drop it and remove the transform feedback export call. 
m_deadCalls.push_back(&callInst); @@ -2135,6 +2134,11 @@ void CollectResourceUsage::mapBuiltInToGenericInOut() { const unsigned mapLoc = nextInOutUsage.builtInInputLocMap[BuiltInLayer]; inOutUsage.builtInOutputLocMap[BuiltInLayer] = mapLoc; availOutMapLoc = std::max(availOutMapLoc, mapLoc + 1); + } else { + if (m_importedOutputBuiltIns.find(BuiltInLayer) != m_importedOutputBuiltIns.end()) + inOutUsage.builtInOutputLocMap[BuiltInLayer] = InvalidValue; + else + builtInUsage.tcs.layer = false; } if (nextBuiltInUsage.viewportIndexIn) { @@ -2142,16 +2146,24 @@ void CollectResourceUsage::mapBuiltInToGenericInOut() { const unsigned mapLoc = nextInOutUsage.builtInInputLocMap[BuiltInViewportIndex]; inOutUsage.builtInOutputLocMap[BuiltInViewportIndex] = mapLoc; availOutMapLoc = std::max(availOutMapLoc, mapLoc + 1); + } else { + if (m_importedOutputBuiltIns.find(BuiltInViewportIndex) != m_importedOutputBuiltIns.end()) + inOutUsage.builtInOutputLocMap[BuiltInViewportIndex] = InvalidValue; + else + builtInUsage.tcs.viewportIndex = false; } - // NOTE: We shouldn't clear the usage of tessellation levels if the next stage doesn't read them back because they - // are always required to be written to TF buffer. 
if (nextBuiltInUsage.tessLevelOuter) { assert(nextInOutUsage.perPatchBuiltInInputLocMap.find(BuiltInTessLevelOuter) != nextInOutUsage.perPatchBuiltInInputLocMap.end()); const unsigned mapLoc = nextInOutUsage.perPatchBuiltInInputLocMap[BuiltInTessLevelOuter]; inOutUsage.perPatchBuiltInOutputLocMap[BuiltInTessLevelOuter] = mapLoc; availPerPatchOutMapLoc = std::max(availPerPatchOutMapLoc, mapLoc + 1); + } else { + if (m_importedOutputBuiltIns.find(BuiltInTessLevelOuter) != m_importedOutputBuiltIns.end()) + inOutUsage.perPatchBuiltInOutputLocMap[BuiltInTessLevelOuter] = InvalidValue; + else + builtInUsage.tcs.tessLevelOuter = false; } if (nextBuiltInUsage.tessLevelInner) { @@ -2160,6 +2172,11 @@ void CollectResourceUsage::mapBuiltInToGenericInOut() { const unsigned mapLoc = nextInOutUsage.perPatchBuiltInInputLocMap[BuiltInTessLevelInner]; inOutUsage.perPatchBuiltInOutputLocMap[BuiltInTessLevelInner] = mapLoc; availPerPatchOutMapLoc = std::max(availPerPatchOutMapLoc, mapLoc + 1); + } else { + if (m_importedOutputBuiltIns.find(BuiltInTessLevelInner) != m_importedOutputBuiltIns.end()) + inOutUsage.perPatchBuiltInOutputLocMap[BuiltInTessLevelInner] = InvalidValue; + else + builtInUsage.tcs.tessLevelInner = false; } // Revisit built-in outputs and map those unmapped to generic ones @@ -2178,6 +2195,24 @@ void CollectResourceUsage::mapBuiltInToGenericInOut() { if (inOutUsage.builtInOutputLocMap.find(BuiltInCullDistance) != inOutUsage.builtInOutputLocMap.end() && inOutUsage.builtInOutputLocMap[BuiltInCullDistance] == InvalidValue) inOutUsage.builtInOutputLocMap[BuiltInCullDistance] = availOutMapLoc++; + + if (inOutUsage.builtInOutputLocMap.find(BuiltInLayer) != inOutUsage.builtInOutputLocMap.end() && + inOutUsage.builtInOutputLocMap[BuiltInLayer] == InvalidValue) + inOutUsage.builtInOutputLocMap[BuiltInLayer] = availOutMapLoc++; + + if (inOutUsage.builtInOutputLocMap.find(BuiltInViewportIndex) != inOutUsage.builtInOutputLocMap.end() && + 
inOutUsage.builtInOutputLocMap[BuiltInViewportIndex] == InvalidValue) + inOutUsage.builtInOutputLocMap[BuiltInViewportIndex] = availOutMapLoc++; + + if (inOutUsage.perPatchBuiltInOutputLocMap.find(BuiltInTessLevelOuter) != + inOutUsage.perPatchBuiltInOutputLocMap.end() && + inOutUsage.perPatchBuiltInOutputLocMap[BuiltInTessLevelOuter] == InvalidValue) + inOutUsage.perPatchBuiltInOutputLocMap[BuiltInTessLevelOuter] = availPerPatchOutMapLoc++; + + if (inOutUsage.perPatchBuiltInOutputLocMap.find(BuiltInTessLevelInner) != + inOutUsage.perPatchBuiltInOutputLocMap.end() && + inOutUsage.perPatchBuiltInOutputLocMap[BuiltInTessLevelInner] == InvalidValue) + inOutUsage.perPatchBuiltInOutputLocMap[BuiltInTessLevelInner] = availPerPatchOutMapLoc++; } else if (!nextStage) { // TCS only if (builtInUsage.tcs.position) diff --git a/lgc/patch/Continufy.cpp b/lgc/patch/Continufy.cpp index 5b70f69a83..c72c298af9 100644 --- a/lgc/patch/Continufy.cpp +++ b/lgc/patch/Continufy.cpp @@ -198,7 +198,7 @@ PreservedAnalyses Continufy::run(Module &module, ModuleAnalysisManager &analysis tailArgs.push_back(retValue); builder.create(fnPtr->getArg(1), getReturnedLevels(currentRtStage.value()), poisonI32 /* csp */, - poisonI32 /* rcr */, tailArgs); + poisonI32 /* shaderRecIdx */, poisonI32 /* rcr */, tailArgs); } builder.CreateUnreachable(); diff --git a/lgc/patch/FragmentColorExport.cpp b/lgc/patch/FragmentColorExport.cpp index 3d1ae97140..af84f3373c 100644 --- a/lgc/patch/FragmentColorExport.cpp +++ b/lgc/patch/FragmentColorExport.cpp @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file FragmentColorExport.cpp - * @brief LLPC source file: contains implementation of class lgc::FragColorExport. + * @brief LLPC source file: contains implementation of class lgc::FragmentColorExport. 
*********************************************************************************************************************** */ #include "lgc/patch/FragmentColorExport.h" @@ -56,11 +56,11 @@ namespace lgc { // // @param context : LLVM context // @param pipelineState : Pipeline state -FragColorExport::FragColorExport(LgcContext *context) : m_lgcContext(context) { +FragmentColorExport::FragmentColorExport(LgcContext *context) : m_lgcContext(context) { } // ===================================================================================================================== -LowerFragColorExport::LowerFragColorExport() : m_exportValues(MaxColorTargets + 1, nullptr) { +LowerFragmentColorExport::LowerFragmentColorExport() : m_exportValues(MaxColorTargets + 1, nullptr) { } // ===================================================================================================================== @@ -95,9 +95,9 @@ static void extractElements(Value *input, BuilderBase &builder, std::arraygetType(); @@ -298,7 +298,7 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw // @param value : Output value // @param signedness : Whether the type is signed (valid for integer type) // @param builder : The IR builder for inserting instructions -Value *FragColorExport::convertToHalf(Value *value, bool signedness, BuilderBase &builder) const { +Value *FragmentColorExport::convertToHalf(Value *value, bool signedness, BuilderBase &builder) const { Type *valueTy = value->getType(); unsigned numElements = valueTy->isVectorTy() ? 
cast(valueTy)->getNumElements() : 1; const unsigned bitWidth = valueTy->getScalarSizeInBits(); @@ -331,7 +331,7 @@ Value *FragColorExport::convertToHalf(Value *value, bool signedness, BuilderBase // @param value : Output value // @param signedness : Whether the type is signed (valid for integer type) // @param builder : The IR builder for inserting instructions -Value *FragColorExport::convertToFloat(Value *value, bool signedness, BuilderBase &builder) const { +Value *FragmentColorExport::convertToFloat(Value *value, bool signedness, BuilderBase &builder) const { Type *valueTy = value->getType(); const unsigned bitWidth = valueTy->getScalarSizeInBits(); unsigned numElements = valueTy->isVectorTy() ? cast(valueTy)->getNumElements() : 1; @@ -376,7 +376,7 @@ Value *FragColorExport::convertToFloat(Value *value, bool signedness, BuilderBas // @param value : Output component value // @param signedness : Whether the type is signed (valid for integer type) // @param builder : The IR builder for inserting instructions -Value *FragColorExport::convertToInt(Value *value, bool signedness, BuilderBase &builder) const { +Value *FragmentColorExport::convertToInt(Value *value, bool signedness, BuilderBase &builder) const { Type *valueTy = value->getType(); const unsigned bitWidth = valueTy->getScalarSizeInBits(); unsigned numElements = valueTy->isVectorTy() ? 
cast(valueTy)->getNumElements() : 1; @@ -419,7 +419,7 @@ Value *FragColorExport::convertToInt(Value *value, bool signedness, BuilderBase // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses LowerFragColorExport::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses LowerFragmentColorExport::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); PipelineShadersResult &pipelineShaders = analysisManager.getResult(module); @@ -487,10 +487,10 @@ PreservedAnalyses LowerFragColorExport::run(Module &module, ModuleAnalysisManage return PreservedAnalyses::none(); } - FragColorExport fragColorExport(m_pipelineState->getLgcContext()); + FragmentColorExport fragColorExport(m_pipelineState->getLgcContext()); bool dummyExport = m_resUsage->builtInUsage.fs.discard || m_pipelineState->getOptions().forceFragColorDummyExport || m_pipelineState->getShaderModes()->getFragmentShaderMode().enablePops; - FragColorExport::Key key = FragColorExport::computeKey(m_info, m_pipelineState); + FragmentColorExport::Key key = FragmentColorExport::computeKey(m_info, m_pipelineState); fragColorExport.generateExportInstructions(m_info, m_exportValues, dummyExport, m_pipelineState->getPalMetadata(), builder, dynamicIsDualSource, key); return (!m_info.empty() || dummyExport) ? PreservedAnalyses::none() : PreservedAnalyses::all(); @@ -502,8 +502,8 @@ PreservedAnalyses LowerFragColorExport::run(Module &module, ModuleAnalysisManage // @param callInst : An call to the generic output export builtin in a fragment shader. // @param [in/out] outFragColors : An array with the current color output information for each color output location. 
// @param builder : builder to use -void LowerFragColorExport::updateFragColors(CallInst *callInst, MutableArrayRef outFragColors, - BuilderBase &builder) { +void LowerFragmentColorExport::updateFragColors(CallInst *callInst, MutableArrayRef outFragColors, + BuilderBase &builder) { Value *output = callInst->getOperand(2); if (isa(output)) return; @@ -539,8 +539,8 @@ void LowerFragColorExport::updateFragColors(CallInst *callInst, MutableArrayRef< // // @param fragEntryPoint : The fragment shader to which we should add the export instructions. // @param builder : The builder object that will be used to create new instructions. -void LowerFragColorExport::collectExportInfoForGenericOutputs(Function *fragEntryPoint, BuilderBase &builder) { - std::unique_ptr fragColorExport(new FragColorExport(m_pipelineState->getLgcContext())); +void LowerFragmentColorExport::collectExportInfoForGenericOutputs(Function *fragEntryPoint, BuilderBase &builder) { + std::unique_ptr fragColorExport(new FragmentColorExport(m_pipelineState->getLgcContext())); SmallVector colorExports; // Collect all of the exports in the fragment shader @@ -607,7 +607,7 @@ void LowerFragColorExport::collectExportInfoForGenericOutputs(Function *fragEntr // // @param fragEntryPoint : The fragment shader to which we should add the export instructions. // @param builder : The builder object that will be used to create new instructions. -void LowerFragColorExport::createTailJump(Function *fragEntryPoint, BuilderBase &builder, Value *isDualSource) { +void LowerFragmentColorExport::createTailJump(Function *fragEntryPoint, BuilderBase &builder, Value *isDualSource) { // Add the export info to be used when linking shaders to generate the color export shader and compute the spi shader // color format in the metadata. 
m_pipelineState->getPalMetadata()->addColorExportInfo(m_info); @@ -639,14 +639,13 @@ void LowerFragColorExport::createTailJump(Function *fragEntryPoint, BuilderBase if (m_pipelineState->getOptions().enableColorExportShader) { // Build color export function type - auto funcTy = FunctionType::get(builder.getVoidTy(), outputTypes, false); - // Convert color export shader address to function pointer - auto funcTyPtr = funcTy->getPointerTo(ADDR_SPACE_CONST); + auto funcTyPtr = builder.getPtrTy(ADDR_SPACE_CONST); auto colorShaderAddr = ShaderInputs::getSpecialUserData(UserDataMapping::ColorExportAddr, builder); AddressExtender addrExt(builder.GetInsertPoint()->getParent()->getParent()); auto funcPtr = addrExt.extendWithPc(colorShaderAddr, funcTyPtr, builder); // Jump + auto funcTy = FunctionType::get(builder.getVoidTy(), outputTypes, false); auto callInst = builder.CreateCall(funcTy, funcPtr, cesArgs); callInst->setCallingConv(CallingConv::AMDGPU_Gfx); callInst->addParamAttr(returnLocation, Attribute::InReg); @@ -671,7 +670,7 @@ void LowerFragColorExport::createTailJump(Function *fragEntryPoint, BuilderBase // // @param fragEntryPoint : The fragment shader to which we should add the export instructions. // @param builder : The builder object that will be used to create new instructions. -void LowerFragColorExport::collectExportInfoForBuiltinOutput(Function *module, BuilderBase &builder) { +void LowerFragmentColorExport::collectExportInfoForBuiltinOutput(Function *module, BuilderBase &builder) { // Collect calls to the builtins Value *m_fragDepth = nullptr; Value *m_fragStencilRef = nullptr; @@ -763,7 +762,7 @@ void LowerFragColorExport::collectExportInfoForBuiltinOutput(Function *module, B // // @param [in/out] exportInst : The export instruction to be updated. // @param builder : The builder object that will be used to create new instructions. 
-void FragColorExport::setDoneFlag(Value *exportInst, BuilderBase &builder) { +void FragmentColorExport::setDoneFlag(Value *exportInst, BuilderBase &builder) { if (!exportInst) return; @@ -784,7 +783,7 @@ void FragColorExport::setDoneFlag(Value *exportInst, BuilderBase &builder) { // Swizzle the output to MRT0/MRT1 for dual source blend on GFX11+, and return the last export instruction. // // @param builder : The builder object that will be used to create new instructions. -Value *FragColorExport::dualSourceSwizzle(unsigned waveSize, BuilderBase &builder) { +Value *FragmentColorExport::dualSourceSwizzle(unsigned waveSize, BuilderBase &builder) { Value *result0[4], *result1[4]; auto undefFloat = PoisonValue::get(builder.getFloatTy()); @@ -865,9 +864,11 @@ Value *FragColorExport::dualSourceSwizzle(unsigned waveSize, BuilderBase &builde // @param needMrt0a: The flag to tell MRT0.a is required. // @param pCbShaderMask: The cbShaderMask after update color export information // @param [out] outExpinfo : The updated color export information when enableFragColor is true. -void FragColorExport::updateColorExportInfoWithBroadCastInfo(const Key &key, ArrayRef originExpinfo, - bool needMrt0a, SmallVector &outExpinfo, - unsigned *pCbShaderMask) { +void FragmentColorExport::updateColorExportInfoWithBroadCastInfo(const Key &key, + ArrayRef originExpinfo, + bool needMrt0a, + SmallVector &outExpinfo, + unsigned *pCbShaderMask) { // As enableFragColor will only be enabled by OGL, so it will not consider on the dualSource cases. SmallVector broadCastInfo; if (key.enableFragColor) { @@ -904,9 +905,9 @@ void FragColorExport::updateColorExportInfoWithBroadCastInfo(const Key &key, Arr // @param builder : The builder object that will be used to create new instructions. 
// @param dynamicIsDualSource: Identify whether it's in dynamicDualSourceBlend state // @param key: Color export Info -void FragColorExport::generateExportInstructions(ArrayRef info, ArrayRef values, - bool dummyExport, PalMetadata *palMetadata, BuilderBase &builder, - Value *dynamicIsDualSource, const Key &key) { +void FragmentColorExport::generateExportInstructions(ArrayRef info, ArrayRef values, + bool dummyExport, PalMetadata *palMetadata, BuilderBase &builder, + Value *dynamicIsDualSource, const Key &key) { Value *lastExport = nullptr; unsigned gfxip = m_lgcContext->getTargetInfo().getGfxIpVersion().major; @@ -1016,7 +1017,7 @@ void FragColorExport::generateExportInstructions(ArrayRef info, if (m_blendSourceChannels > 0) { lastExport = dualSourceSwizzle(key.waveSize, builder); - FragColorExport::setDoneFlag(lastExport, builder); + FragmentColorExport::setDoneFlag(lastExport, builder); } builder.CreateRetVoid(); } @@ -1075,7 +1076,7 @@ void FragColorExport::generateExportInstructions(ArrayRef info, } } if (lastExport) - FragColorExport::setDoneFlag(lastExport, builder); + FragmentColorExport::setDoneFlag(lastExport, builder); builder.CreateRetVoid(); } @@ -1093,7 +1094,7 @@ void FragColorExport::generateExportInstructions(ArrayRef info, // @param value : The value to be modified. // @param outputTy : The type that the value should be converted to. // @param builder : The builder object that will be used to create new instructions. 
-Value *LowerFragColorExport::generateValueForOutput(Value *value, Type *outputTy, BuilderBase &builder) { +Value *LowerFragmentColorExport::generateValueForOutput(Value *value, Type *outputTy, BuilderBase &builder) { unsigned originalSize = value->getType()->getPrimitiveSizeInBits(); unsigned finalSize = outputTy->getPrimitiveSizeInBits(); if (originalSize < finalSize) { @@ -1117,8 +1118,8 @@ Value *LowerFragColorExport::generateValueForOutput(Value *value, Type *outputTy // @param [in/out] module : The LLVM module in which to add the shader. // @param pipelineState : Pipeline state. // @returns : the entry point for the null fragment shader. -Function *FragColorExport::generateNullFragmentShader(Module &module, PipelineState *pipelineState, - StringRef entryPointName) { +Function *FragmentColorExport::generateNullFragmentShader(Module &module, PipelineState *pipelineState, + StringRef entryPointName) { Function *entryPoint = generateNullFragmentEntryPoint(module, pipelineState, entryPointName); generateNullFragmentShaderBody(entryPoint); return entryPoint; @@ -1130,8 +1131,8 @@ Function *FragColorExport::generateNullFragmentShader(Module &module, PipelineSt // @param [in/out] module : The LLVM module in which to add the entry point. // @param pipelineState : Pipeline state. // @returns : The new entry point. 
-Function *FragColorExport::generateNullFragmentEntryPoint(Module &module, PipelineState *pipelineState, - StringRef entryPointName) { +Function *FragmentColorExport::generateNullFragmentEntryPoint(Module &module, PipelineState *pipelineState, + StringRef entryPointName) { FunctionType *entryPointTy = FunctionType::get(Type::getVoidTy(module.getContext()), ArrayRef(), false); Function *entryPoint = Function::Create(entryPointTy, GlobalValue::ExternalLinkage, entryPointName, &module); entryPoint->setDLLStorageClass(GlobalValue::DLLExportStorageClass); @@ -1146,7 +1147,7 @@ Function *FragColorExport::generateNullFragmentEntryPoint(Module &module, Pipeli // Generate the body of the null fragment shader. // // @param [in/out] entryPoint : The function in which the code will be inserted. -void FragColorExport::generateNullFragmentShaderBody(llvm::Function *entryPoint) { +void FragmentColorExport::generateNullFragmentShaderBody(llvm::Function *entryPoint) { BasicBlock *block = BasicBlock::Create(entryPoint->getContext(), "", entryPoint); BuilderBase builder(block); builder.CreateRetVoid(); @@ -1158,8 +1159,9 @@ void FragColorExport::generateNullFragmentShaderBody(llvm::Function *entryPoint) // @param info : The color export information for each color export in no particular order. // @param pipelineState : Pipeline state // @returns : Color export info. 
-FragColorExport::Key FragColorExport::computeKey(ArrayRef infos, PipelineState *pipelineState) { - FragColorExport::Key key = {}; +FragmentColorExport::Key FragmentColorExport::computeKey(ArrayRef infos, + PipelineState *pipelineState) { + FragmentColorExport::Key key = {}; key.enableFragColor = pipelineState->getOptions().enableFragColor; key.colorExportState = pipelineState->getColorExportState(); key.waveSize = pipelineState->getShaderWaveSize(ShaderStage::Fragment); diff --git a/lgc/patch/GenerateCopyShader.cpp b/lgc/patch/GenerateCopyShader.cpp index e22e2250ca..62ff383581 100644 --- a/lgc/patch/GenerateCopyShader.cpp +++ b/lgc/patch/GenerateCopyShader.cpp @@ -29,6 +29,7 @@ *********************************************************************************************************************** */ #include "lgc/patch/GenerateCopyShader.h" +#include "lgc/LgcDialect.h" #include "lgc/state/IntrinsDefs.h" #include "lgc/state/PalMetadata.h" #include "lgc/state/PipelineShaders.h" @@ -486,7 +487,7 @@ Value *GenerateCopyShader::calcGsVsRingOffsetForInput(unsigned location, unsigne Value *ringOffset = nullptr; if (m_pipelineState->isGsOnChip()) { // ringOffset = esGsLdsSize + vertexOffset + location * 4 + compIdx - ringOffset = builder.getInt32(resUsage->inOutUsage.gs.calcFactor.esGsLdsSize); + ringOffset = builder.getInt32(resUsage->inOutUsage.gs.hwConfig.esGsLdsSize); ringOffset = builder.CreateAdd(ringOffset, vertexOffset); ringOffset = builder.CreateAdd(ringOffset, builder.getInt32(location * 4 + compIdx)); } else { @@ -533,15 +534,8 @@ Value *GenerateCopyShader::loadValueFromGsVsRing(Type *loadTy, unsigned location else assert(elemCount + component <= 4); - if (m_pipelineState->getNggControl()->enableNgg) { - // NOTE: For NGG, reading GS output from GS-VS ring is represented by a call and the call is replaced with - // real instructions when when NGG primitive shader is generated. 
- std::string callName(lgcName::NggReadGsOutput); - callName += getTypeName(loadTy); - return builder.CreateNamedCall( - callName, loadTy, {builder.getInt32(location), builder.getInt32(component), builder.getInt32(streamId)}, - {Attribute::Speculatable, Attribute::ReadOnly, Attribute::WillReturn}); - } + if (m_pipelineState->getNggControl()->enableNgg) + return builder.create(loadTy, location, component, streamId); // NOTE: NGG with GS must have been handled. Here we only handle pre-GFX11 generations with legacy pipeline. assert(m_pipelineState->getTargetInfo().getGfxIpVersion().major < 11); @@ -632,12 +626,7 @@ void GenerateCopyShader::exportXfbOutput(Value *outputValue, const XfbOutInfo &x inOutUsage.xfbExpCount += outputValue->getType()->getPrimitiveSizeInBits() > 128 ? 2 : 1; } - Value *args[] = {builder.getInt32(xfbOutInfo.xfbBuffer), builder.getInt32(xfbOutInfo.xfbOffset), - builder.getInt32(xfbOutInfo.streamId), outputValue}; - - std::string instName(lgcName::OutputExportXfb); - addTypeMangling(nullptr, args, instName); - builder.CreateNamedCall(instName, builder.getVoidTy(), args, {}); + builder.create(xfbOutInfo.xfbBuffer, xfbOutInfo.xfbOffset, xfbOutInfo.streamId, outputValue); } // ===================================================================================================================== @@ -668,11 +657,7 @@ void GenerateCopyShader::exportBuiltInOutput(Value *outputValue, BuiltInKind bui } const auto &xfbOutInfo = locInfoXfbOutInfoMapIt->second; - std::string instName(lgcName::OutputExportXfb); - Value *args[] = {builder.getInt32(xfbOutInfo.xfbBuffer), builder.getInt32(xfbOutInfo.xfbOffset), - builder.getInt32(0), outputValue}; - addTypeMangling(nullptr, args, instName); - builder.CreateNamedCall(instName, builder.getVoidTy(), args, {}); + builder.create(xfbOutInfo.xfbBuffer, xfbOutInfo.xfbOffset, 0, outputValue); } } diff --git a/lgc/patch/GenerateNullFragmentShader.cpp b/lgc/patch/GenerateNullFragmentShader.cpp index 66a1cea598..a8157ba9b5 
100644 --- a/lgc/patch/GenerateNullFragmentShader.cpp +++ b/lgc/patch/GenerateNullFragmentShader.cpp @@ -68,7 +68,7 @@ PreservedAnalyses GenerateNullFragmentShader::run(Module &module, ModuleAnalysis if (hasFs || !pipelineState->isGraphics()) return PreservedAnalyses::all(); - FragColorExport::generateNullFragmentShader(module, pipelineState, lgcName::NullFsEntryPoint); + FragmentColorExport::generateNullFragmentShader(module, pipelineState, lgcName::NullFsEntryPoint); updatePipelineState(pipelineState); return PreservedAnalyses::none(); } diff --git a/lgc/patch/PatchInitializeWorkgroupMemory.cpp b/lgc/patch/InitializeWorkgroupMemory.cpp similarity index 95% rename from lgc/patch/PatchInitializeWorkgroupMemory.cpp rename to lgc/patch/InitializeWorkgroupMemory.cpp index 1d5cc9ce8f..76fe3fc4a9 100644 --- a/lgc/patch/PatchInitializeWorkgroupMemory.cpp +++ b/lgc/patch/InitializeWorkgroupMemory.cpp @@ -24,12 +24,12 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file PatchInitializeWorkgroupMemory.cpp - * @brief LLPC source file: contains declaration and implementation of class lgc::PatchInitializeWorkgroupMemory. + * @file InitializeWorkgroupMemory.cpp + * @brief LLPC source file: contains declaration and implementation of class lgc::InitializeWorkgroupMemory. 
*********************************************************************************************************************** */ -#include "lgc/patch/PatchInitializeWorkgroupMemory.h" +#include "lgc/patch/InitializeWorkgroupMemory.h" #include "lgc/patch/ShaderInputs.h" #include "lgc/state/PipelineShaders.h" #include "lgc/state/PipelineState.h" @@ -37,7 +37,7 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Support/CommandLine.h" -#define DEBUG_TYPE "lgc-patch-initialize-workgroup-memory" +#define DEBUG_TYPE "lgc-initialize-workgroup-memory" using namespace lgc; using namespace llvm; @@ -54,7 +54,7 @@ namespace lgc { // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchInitializeWorkgroupMemory::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses InitializeWorkgroupMemory::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); PipelineShadersResult &pipelineShaders = analysisManager.getResult(module); @@ -120,7 +120,7 @@ PreservedAnalyses PatchInitializeWorkgroupMemory::run(Module &module, ModuleAnal // // @param lds : The LDS variable to be initialized // @param builder : BuilderBase to use for instruction constructing -void PatchInitializeWorkgroupMemory::initializeWithZero(GlobalVariable *lds, BuilderBase &builder) { +void InitializeWorkgroupMemory::initializeWithZero(GlobalVariable *lds, BuilderBase &builder) { auto entryInsertPos = &*m_entryPoint->front().getFirstNonPHIOrDbgOrAlloca(); auto originBlock = entryInsertPos->getParent(); auto endInitBlock = originBlock->splitBasicBlock(entryInsertPos); @@ -241,7 +241,7 @@ void PatchInitializeWorkgroupMemory::initializeWithZero(GlobalVariable *lds, Bui // Return the size in dwords of a variable type // 
// @param inputTy : The type to be calculated -unsigned PatchInitializeWorkgroupMemory::getTypeSizeInDwords(Type *inputTy) { +unsigned InitializeWorkgroupMemory::getTypeSizeInDwords(Type *inputTy) { if (inputTy->isSingleValueType()) { // Variable in LDS is stored in dwords and padded as 4 dwords unsigned dwordCount = 4; diff --git a/lgc/patch/LgcLowering.cpp b/lgc/patch/LgcLowering.cpp index e7b79c7ddd..ef0b4d882f 100644 --- a/lgc/patch/LgcLowering.cpp +++ b/lgc/patch/LgcLowering.cpp @@ -46,6 +46,7 @@ #include "lgc/patch/FragmentColorExport.h" #include "lgc/patch/GenerateCopyShader.h" #include "lgc/patch/IncludeLlvmIr.h" +#include "lgc/patch/InitializeWorkgroupMemory.h" #include "lgc/patch/LowerBufferOperations.h" #include "lgc/patch/LowerDebugPrintf.h" #include "lgc/patch/LowerDesc.h" @@ -58,7 +59,6 @@ #include "lgc/patch/LowerSubgroupOps.h" #include "lgc/patch/MutateEntryPoint.h" #include "lgc/patch/PassthroughHullShader.h" -#include "lgc/patch/PatchInitializeWorkgroupMemory.h" #include "lgc/patch/PeepholeOptimization.h" #include "lgc/patch/PreparePipelineAbi.h" #include "lgc/patch/ScalarizeLoads.h" @@ -69,12 +69,12 @@ #if LLPC_BUILD_STRIX1 #include "lgc/patch/WorkaroundDsSubdwordWrite.h" #endif +#include "lgc/Debug.h" #include "lgc/patch/CombineCooperativeMatrix.h" #include "lgc/patch/LowerCooperativeMatrix.h" #include "lgc/state/AbiMetadata.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" -#include "lgc/util/Debug.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IRPrinter/IRPrintingPasses.h" @@ -187,7 +187,7 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T if (pipelineState->hasShaderStage(ShaderStage::Vertex) && !pipelineState->hasShaderStage(ShaderStage::TessControl) && pipelineState->hasShaderStage(ShaderStage::TessEval)) - passMgr.addPass(TcsPassthroughShader()); + passMgr.addPass(PassthroughHullShader()); passMgr.addPass(GenerateNullFragmentShader()); 
passMgr.addPass(CollectResourceUsage()); // also removes inactive/unused resources @@ -199,12 +199,12 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T passMgr.addPass(ApplyWorkarounds()); passMgr.addPass(GenerateCopyShader()); passMgr.addPass(LowerVertexFetch()); - passMgr.addPass(LowerFragColorExport()); + passMgr.addPass(LowerFragmentColorExport()); passMgr.addPass(LowerDebugPrintf()); passMgr.addPass(LowerDesc()); passMgr.addPass(MutateEntryPoint()); passMgr.addPass(createModuleToFunctionPassAdaptor(LowerPopsInterlock())); - passMgr.addPass(PatchInitializeWorkgroupMemory()); + passMgr.addPass(InitializeWorkgroupMemory()); passMgr.addPass(LowerInOut()); // Patch invariant load and loop metadata. @@ -253,7 +253,7 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T fpm.addPass(PromotePass()); fpm.addPass(ADCEPass()); fpm.addPass(StructurizeBuffers()); - fpm.addPass(PatchBufferOp()); + fpm.addPass(LowerBufferOperations()); fpm.addPass(InstCombinePass()); fpm.addPass(SimplifyCFGPass()); passMgr.addPass(createModuleToFunctionPassAdaptor(std::move(fpm))); @@ -265,7 +265,7 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T } else { FunctionPassManager fpm; fpm.addPass(StructurizeBuffers()); - fpm.addPass(PatchBufferOp()); + fpm.addPass(LowerBufferOperations()); fpm.addPass(InstCombinePass()); passMgr.addPass(createModuleToFunctionPassAdaptor(std::move(fpm))); } @@ -274,9 +274,9 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T // Set up target features in shader entry-points. // NOTE: Needs to be done after post-NGG function inlining, because LLVM refuses to inline something - // with conflicting attributes. Attributes could conflict on GFX10 because PatchSetupTargetFeatures + // with conflicting attributes. Attributes could conflict on GFX10 because SetUpTargetFeatures // adds a target feature to determine wave32 or wave64. 
- passMgr.addPass(PatchSetupTargetFeatures()); + passMgr.addPass(SetUpTargetFeatures()); // Include LLVM IR as a separate section in the ELF binary if (pipelineState->getOptions().includeIr) diff --git a/lgc/patch/LowerBufferOperations.cpp b/lgc/patch/LowerBufferOperations.cpp index 851714a72e..40c39eb9bb 100644 --- a/lgc/patch/LowerBufferOperations.cpp +++ b/lgc/patch/LowerBufferOperations.cpp @@ -25,30 +25,26 @@ /** *********************************************************************************************************************** * @file LowerBufferOperations.cpp - * @brief LLPC source file: contains implementation of class lgc::PatchBufferOp. + * @brief LLPC source file: contains implementation of class lgc::LowerBufferOperations. *********************************************************************************************************************** */ #include "lgc/patch/LowerBufferOperations.h" -#include "lgc/Builder.h" #include "lgc/CommonDefs.h" #include "lgc/LgcContext.h" #include "lgc/LgcDialect.h" -#include "lgc/builder/BuilderImpl.h" #include "lgc/state/IntrinsDefs.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" #include "llvm-dialects/Dialect/Visitor.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#define DEBUG_TYPE "lgc-patch-buffer-op" +#define DEBUG_TYPE "lgc-lower-buffer-operations" using namespace CompilerUtils; using namespace llvm; @@ -56,8 +52,8 @@ using namespace lgc; namespace { -struct PatchBufferOpImpl { - PatchBufferOpImpl(LLVMContext &context, PipelineState &pipelineState, UniformityInfo &uniformityInfo); +struct LowerBufferOperationsImpl { + LowerBufferOperationsImpl(LLVMContext &context, PipelineState 
&pipelineState, UniformityInfo &uniformityInfo); bool run(Function &function); @@ -67,8 +63,8 @@ struct PatchBufferOpImpl { } // anonymous namespace -LLVM_DIALECTS_VISITOR_PAYLOAD_PROJECT_FIELD(PatchBufferOpImpl, m_typeLowering) -LLVM_DIALECTS_VISITOR_PAYLOAD_PROJECT_FIELD(PatchBufferOpImpl, m_bufferOpLowering) +LLVM_DIALECTS_VISITOR_PAYLOAD_PROJECT_FIELD(LowerBufferOperationsImpl, m_typeLowering) +LLVM_DIALECTS_VISITOR_PAYLOAD_PROJECT_FIELD(LowerBufferOperationsImpl, m_bufferOpLowering) // ===================================================================================================================== // Executes this LLVM patching pass on the specified LLVM function. @@ -76,21 +72,22 @@ LLVM_DIALECTS_VISITOR_PAYLOAD_PROJECT_FIELD(PatchBufferOpImpl, m_bufferOpLowerin // @param [in/out] function : LLVM function to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchBufferOp::run(Function &function, FunctionAnalysisManager &analysisManager) { +PreservedAnalyses LowerBufferOperations::run(Function &function, FunctionAnalysisManager &analysisManager) { const auto &moduleAnalysisManager = analysisManager.getResult(function); PipelineState *pipelineState = moduleAnalysisManager.getCachedResult(*function.getParent())->getPipelineState(); UniformityInfo &uniformityInfo = analysisManager.getResult(function); - PatchBufferOpImpl impl(function.getContext(), *pipelineState, uniformityInfo); + LowerBufferOperationsImpl impl(function.getContext(), *pipelineState, uniformityInfo); if (impl.run(function)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); } // ===================================================================================================================== -// Construct the per-run temporaries of the PatchBufferOp pass. 
-PatchBufferOpImpl::PatchBufferOpImpl(LLVMContext &context, PipelineState &pipelineState, UniformityInfo &uniformityInfo) +// Construct the per-run temporaries of the LowerBufferOperations pass. +LowerBufferOperationsImpl::LowerBufferOperationsImpl(LLVMContext &context, PipelineState &pipelineState, + UniformityInfo &uniformityInfo) : m_typeLowering(context), m_bufferOpLowering(m_typeLowering, pipelineState, uniformityInfo) { } @@ -99,10 +96,10 @@ PatchBufferOpImpl::PatchBufferOpImpl(LLVMContext &context, PipelineState &pipeli // // @param [in,out] function : LLVM function to be run on // @returns : True if the module was modified by the transformation and false otherwise -bool PatchBufferOpImpl::run(Function &function) { +bool LowerBufferOperationsImpl::run(Function &function) { LLVM_DEBUG(dbgs() << "Run the pass Patch-Buffer-Op on: " << function.getName() << '\n'); - static const auto visitor = llvm_dialects::VisitorBuilder() + static const auto visitor = llvm_dialects::VisitorBuilder() .nest(&BufferOpLowering::registerVisitors) .nest(&TypeLowering::registerVisitors) .build(); @@ -126,13 +123,17 @@ static SmallVector convertBufferPointer(TypeLowering &typeLowering, Type auto &context = type->getContext(); switch (pointerType->getAddressSpace()) { case ADDR_SPACE_BUFFER_FAT_POINTER: - types.push_back(FixedVectorType::get(Type::getInt32Ty(context), 4)); + types.push_back(FixedVectorType::get(Type::getInt32Ty(context), 4)); // the concrete 128-bit descriptor types.push_back(PointerType::get(context, ADDR_SPACE_CONST_32BIT)); + types.push_back(Type::getIntNTy(context, 1)); // whether indexed access is possible + types.push_back(Type::getInt32Ty(context)); // the index, if an indexed access is possible; and poison otherwise break; case ADDR_SPACE_BUFFER_STRIDED_POINTER: types.push_back(FixedVectorType::get(Type::getInt32Ty(context), 4)); types.push_back(PointerType::get(context, ADDR_SPACE_CONST_32BIT)); types.push_back(Type::getInt32Ty(context)); + 
types.push_back(Type::getIntNTy(context, 1)); // whether indexed access is possible + types.push_back(Type::getInt32Ty(context)); // the index, if an indexed access is possible; and poison otherwise break; default: break; @@ -150,7 +151,7 @@ static SmallVector convertBufferPointer(TypeLowering &typeLowering, Type // @param uniformityInfo : the uniformity analysis result BufferOpLowering::BufferOpLowering(TypeLowering &typeLowering, PipelineState &pipelineState, UniformityInfo &uniformityInfo) - : m_typeLowering(typeLowering), m_builder(typeLowering.getContext()), m_pipelineState(pipelineState), + : m_typeLowering(typeLowering), m_builder(&pipelineState), m_pipelineState(pipelineState), m_uniformityInfo(uniformityInfo) { m_typeLowering.addRule(&convertBufferPointer); @@ -684,14 +685,17 @@ void BufferOpLowering::visitBufferAddrToPtr(BufferAddrToPtrOp &op) { Value *address = m_builder.CreatePtrToInt(op.getAddress(), m_builder.getInt64Ty()); address = m_builder.CreateBitCast(address, FixedVectorType::get(m_builder.getInt32Ty(), 2)); - Value *descriptor = createCompactDesc(address, nullptr); + Value *descriptor = m_builder.buildBufferCompactDesc(address, nullptr); - m_typeLowering.replaceInstruction(&op, {descriptor, ConstantPointerNull::get(m_offsetType)}); + m_typeLowering.replaceInstruction(&op, {descriptor, ConstantPointerNull::get(m_offsetType), m_builder.getFalse(), + PoisonValue::get(m_builder.getInt32Ty())}); auto &di = m_descriptors[descriptor]; di.divergent = m_uniformityInfo.isDivergent(op.getAddress()); LLVM_DEBUG(dbgs() << (di.divergent.value() ? 
"Divergent" : "Uniform") << " descriptor: " << *descriptor << '\n'); + + di.globallyCoherent = op.getGloballyCoherent(); } // ===================================================================================================================== @@ -702,12 +706,15 @@ void BufferOpLowering::visitBufferDescToPtr(BufferDescToPtrOp &descToPtr) { m_builder.SetInsertPoint(&descToPtr); auto *descriptor = descToPtr.getDesc(); - m_typeLowering.replaceInstruction(&descToPtr, {descriptor, ConstantPointerNull::get(m_offsetType)}); + m_typeLowering.replaceInstruction(&descToPtr, {descriptor, ConstantPointerNull::get(m_offsetType), + m_builder.getFalse(), PoisonValue::get(m_builder.getInt32Ty())}); auto &di = m_descriptors[descriptor]; di.divergent = m_uniformityInfo.isDivergent(descToPtr.getDesc()); LLVM_DEBUG(dbgs() << (di.divergent.value() ? "Divergent" : "Uniform") << " descriptor: " << *descriptor << '\n'); + + di.globallyCoherent = descToPtr.getGloballyCoherent(); } // ===================================================================================================================== @@ -736,7 +743,9 @@ void BufferOpLowering::visitConvertToStridedBufferPointer(ConvertToStridedBuffer currentDword3 = m_builder.CreateOr(currentDword3, 0x10000000); newDescriptor = m_builder.CreateInsertElement(newDescriptor, currentDword3, 3); - m_typeLowering.replaceInstruction(&convertToStrided, {newDescriptor, values[1], m_builder.getInt32(0)}); + m_typeLowering.replaceInstruction(&convertToStrided, + {newDescriptor, values[1], m_builder.getInt32(0), m_builder.getFalse(), + PoisonValue::get(m_builder.getInt32Ty())}); DescriptorInfo di = m_descriptors.lookup(oldDescriptor); m_descriptors.insert({newDescriptor, di}); @@ -752,12 +761,15 @@ void BufferOpLowering::visitStridedBufferDescToPtr(StridedBufferDescToPtrOp &des auto *descriptor = descToPtr.getDesc(); m_typeLowering.replaceInstruction(&descToPtr, - {descriptor, ConstantPointerNull::get(m_offsetType), m_builder.getInt32(0)}); + 
{descriptor, ConstantPointerNull::get(m_offsetType), m_builder.getInt32(0), + m_builder.getFalse(), PoisonValue::get(m_builder.getInt32Ty())}); auto &di = m_descriptors[descriptor]; di.divergent = m_uniformityInfo.isDivergent(descriptor); LLVM_DEBUG(dbgs() << (di.divergent.value() ? "Divergent" : "Uniform") << " descriptor: " << *descriptor << '\n'); + + di.globallyCoherent = descToPtr.getGloballyCoherent(); } // ===================================================================================================================== @@ -769,14 +781,17 @@ void BufferOpLowering::visitStridedBufferAddrAndStrideToPtr(StridedBufferAddrAnd Value *address = m_builder.CreatePtrToInt(addrAndStrideToPtr.getAddress(), m_builder.getInt64Ty()); address = m_builder.CreateBitCast(address, FixedVectorType::get(m_builder.getInt32Ty(), 2)); - Value *bufDesc = createCompactDesc(address, addrAndStrideToPtr.getStride()); + Value *bufDesc = m_builder.buildBufferCompactDesc(address, addrAndStrideToPtr.getStride()); Constant *const nullPointerOff = ConstantPointerNull::get(m_offsetType); - m_typeLowering.replaceInstruction(&addrAndStrideToPtr, {bufDesc, nullPointerOff, m_builder.getInt32(0)}); + m_typeLowering.replaceInstruction( + &addrAndStrideToPtr, + {bufDesc, nullPointerOff, m_builder.getInt32(0), m_builder.getFalse(), PoisonValue::get(m_builder.getInt32Ty())}); auto &di = m_descriptors[bufDesc]; di.divergent = m_uniformityInfo.isDivergent(addrAndStrideToPtr.getAddress()); + di.globallyCoherent = addrAndStrideToPtr.getGloballyCoherent(); } // ===================================================================================================================== @@ -786,15 +801,23 @@ void BufferOpLowering::visitStridedBufferAddrAndStrideToPtr(StridedBufferAddrAnd void BufferOpLowering::visitBufferLoadDescToPtr(BufferLoadDescToPtrOp &loadDescToPtr) { m_builder.SetInsertPoint(&loadDescToPtr); bool needLoadDesc = true; - Value *descriptor = loadDescToPtr.getDescPtr(); + // NOTE: Rely on later 
cleanup passes to handle the case where we create descriptor load instructions that end up + // being unnecessary due to indexed loads + Value *descriptor = + createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); if (needLoadDesc) { - descriptor = - createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); if (loadDescToPtr.getIsCompact()) - descriptor = createCompactDesc(descriptor, nullptr); - } + descriptor = m_builder.buildBufferCompactDesc(descriptor, nullptr); - m_typeLowering.replaceInstruction(&loadDescToPtr, {descriptor, ConstantPointerNull::get(m_offsetType)}); + m_typeLowering.replaceInstruction(&loadDescToPtr, {descriptor, ConstantPointerNull::get(m_offsetType), + m_builder.getFalse(), PoisonValue::get(m_builder.getInt32Ty())}); + } else { + Value *index = m_builder.CreatePtrToInt(loadDescToPtr.getDescPtr(), m_builder.getInt64Ty()); + index = m_builder.CreateBitCast(index, FixedVectorType::get(m_builder.getInt32Ty(), 2)); + index = m_builder.CreateExtractElement(index, m_builder.getInt64(0)); + m_typeLowering.replaceInstruction(&loadDescToPtr, + {descriptor, ConstantPointerNull::get(m_offsetType), m_builder.getTrue(), index}); + } auto &di = m_descriptors[descriptor]; @@ -803,6 +826,8 @@ void BufferOpLowering::visitBufferLoadDescToPtr(BufferLoadDescToPtrOp &loadDescT di.divergent = m_uniformityInfo.isDivergent(loadSrc); LLVM_DEBUG(dbgs() << (di.divergent.value() ? 
"Divergent" : "Uniform") << " descriptor: " << *descriptor << '\n'); + + di.globallyCoherent = loadDescToPtr.getGloballyCoherent(); } // ===================================================================================================================== @@ -812,17 +837,20 @@ void BufferOpLowering::visitBufferLoadDescToPtr(BufferLoadDescToPtrOp &loadDescT void BufferOpLowering::visitStridedBufferLoadDescToPtr(StridedBufferLoadDescToPtrOp &loadDescToPtr) { m_builder.SetInsertPoint(&loadDescToPtr); bool needLoadDesc = true; - Value *descriptor = loadDescToPtr.getDescPtr(); + Value *descriptor = + createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); if (needLoadDesc) { - descriptor = - createLoadDesc(loadDescToPtr.getDescPtr(), loadDescToPtr.getForceRawView(), loadDescToPtr.getIsCompact()); - if (loadDescToPtr.getIsCompact()) - descriptor = createCompactDesc(descriptor, loadDescToPtr.getStride()); - } + descriptor = m_builder.buildBufferCompactDesc(descriptor, loadDescToPtr.getStride()); - m_typeLowering.replaceInstruction(&loadDescToPtr, - {descriptor, ConstantPointerNull::get(m_offsetType), m_builder.getInt32(0)}); + m_typeLowering.replaceInstruction(&loadDescToPtr, + {descriptor, ConstantPointerNull::get(m_offsetType), m_builder.getInt32(0), + m_builder.getFalse(), PoisonValue::get(m_builder.getInt32Ty())}); + } else { + Value *index = m_builder.CreateBitCast(loadDescToPtr.getDescPtr(), m_builder.getInt32Ty()); + m_typeLowering.replaceInstruction(&loadDescToPtr, {descriptor, ConstantPointerNull::get(m_offsetType), + m_builder.getInt32(0), m_builder.getTrue(), index}); + } auto &di = m_descriptors[descriptor]; @@ -831,6 +859,8 @@ void BufferOpLowering::visitStridedBufferLoadDescToPtr(StridedBufferLoadDescToPt di.divergent = m_uniformityInfo.isDivergent(loadSrc); LLVM_DEBUG(dbgs() << (di.divergent.value() ? 
"Divergent" : "Uniform") << " descriptor: " << *descriptor << '\n'); + + di.globallyCoherent = loadDescToPtr.getGloballyCoherent(); } // ===================================================================================================================== @@ -848,10 +878,12 @@ void BufferOpLowering::visitStridedIndexAdd(StridedIndexAddOp &indexAdd) { // If the old index zero, we can skip the addition and just take the delta index // Otherwise, we need to add the delta index to the old one. - if (auto oldIndexInt = dyn_cast(values[2]); !oldIndexInt || !(oldIndexInt->isZero())) + if (auto oldIndexInt = dyn_cast(values[2]); !oldIndexInt || !(oldIndexInt->isZero())) { + m_builder.SetInsertPoint(&indexAdd); deltaIndex = m_builder.CreateAdd(values[2], deltaIndex); + } - m_typeLowering.replaceInstruction(&indexAdd, {values[0], values[1], deltaIndex}); + m_typeLowering.replaceInstruction(&indexAdd, {values[0], values[1], deltaIndex, values[3], values[4]}); } // ===================================================================================================================== @@ -937,9 +969,9 @@ void BufferOpLowering::visitGetElementPtrInst(GetElementPtrInst &getElemPtrInst) copyMetadata(newGetElemPtr, &getElemPtrInst); if (getElemPtrInst.getAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) - m_typeLowering.replaceInstruction(&getElemPtrInst, {values[0], newGetElemPtr, values[2]}); + m_typeLowering.replaceInstruction(&getElemPtrInst, {values[0], newGetElemPtr, values[2], values[3], values[4]}); else - m_typeLowering.replaceInstruction(&getElemPtrInst, {values[0], newGetElemPtr}); + m_typeLowering.replaceInstruction(&getElemPtrInst, {values[0], newGetElemPtr, values[2], values[3]}); } // ===================================================================================================================== @@ -1155,11 +1187,10 @@ void BufferOpLowering::visitReadFirstLane(llvm::IntrinsicInst &intrinsic) { return; auto values = 
m_typeLowering.getValue(intrinsic.getArgOperand(0)); - Value *desc = values[0]; Value *ptr = values[1]; ptr = m_builder.CreateIntrinsic(ptr->getType(), Intrinsic::amdgcn_readfirstlane, ptr); - m_typeLowering.replaceInstruction(&intrinsic, {desc, ptr}); + m_typeLowering.replaceInstruction(&intrinsic, {values[0], ptr, values[2], values[3]}); } // ===================================================================================================================== @@ -1339,7 +1370,7 @@ void BufferOpLowering::postVisitMemSetInst(MemSetInst &memSetInst) { Value *const destPtr = m_builder.CreateGEP(m_builder.getInt8Ty(), dest, index); copyMetadata(destPtr, &memSetInst); - Value *const castDest = m_builder.CreateBitCast(destPtr, castDestType->getPointerTo(destAddrSpace)); + Value *const castDest = m_builder.CreateBitCast(destPtr, m_builder.getPtrTy(destAddrSpace)); copyMetadata(castDest, &memSetInst); // And perform a store for the value at this byte. @@ -1362,7 +1393,7 @@ void BufferOpLowering::postVisitMemSetInst(MemSetInst &memSetInst) { Value *const memoryPointer = m_builder.CreateAlloca(memoryType); copyMetadata(memoryPointer, &memSetInst); - Type *const int8PtrTy = m_builder.getInt8Ty()->getPointerTo(ADDR_SPACE_PRIVATE); + Type *const int8PtrTy = m_builder.getPtrTy(ADDR_SPACE_PRIVATE); Value *const castMemoryPointer = m_builder.CreateBitCast(memoryPointer, int8PtrTy); copyMetadata(castMemoryPointer, &memSetInst); @@ -1413,10 +1444,8 @@ void BufferOpLowering::postVisitLoadTfeOp(LoadTfeOp &loadTfe) { bufferLoad = m_builder.CreateIntrinsic(loadTfe.getType(), Intrinsic::amdgcn_struct_buffer_load, {bufferDesc, index, offset, m_builder.getInt32(0), m_builder.getInt32(0)}); } - if (getDescriptorInfo(bufferDesc).divergent.value()) { - BuilderImpl builderImpl(&m_pipelineState); - bufferLoad = builderImpl.createWaterfallLoop(bufferLoad, 0, false); - } + if (getDescriptorInfo(bufferDesc).divergent.value()) + bufferLoad = m_builder.createWaterfallLoop(bufferLoad, 0, false); // 
Record the load instruction so we remember to delete it later. m_typeLowering.eraseInstruction(&loadTfe); @@ -1437,7 +1466,7 @@ Value *BufferOpLowering::getBaseAddressFromBufferDesc(Value *const bufferDesc) { Value *const baseAddrMask = ConstantVector::get({m_builder.getInt32(0xFFFFFFFF), m_builder.getInt32(0xFFFF)}); baseAddr = m_builder.CreateAnd(baseAddr, baseAddrMask); baseAddr = m_builder.CreateBitCast(baseAddr, m_builder.getInt64Ty()); - return m_builder.CreateIntToPtr(baseAddr, m_builder.getInt8Ty()->getPointerTo(ADDR_SPACE_GLOBAL)); + return m_builder.CreateIntToPtr(baseAddr, m_builder.getPtrTy(ADDR_SPACE_GLOBAL)); } // ===================================================================================================================== @@ -1499,9 +1528,17 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { m_builder.SetInsertPoint(&inst); + const bool isStridedPointer = + pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER; auto pointerValues = m_typeLowering.getValue(pointerOperand); - Value *const bufferDesc = pointerValues[0]; - const bool isIndexedDesc = isa(bufferDesc->getType()); + unsigned id = isStridedPointer ? 
3 : 2; + Value *bufferDesc = pointerValues[0]; + bool isIndexedDesc = false; + if (isa(pointerValues[id])) { + isIndexedDesc = cast(pointerValues[id])->isOne(); + if (isIndexedDesc) + bufferDesc = pointerValues[id + 1]; + } const DataLayout &dataLayout = m_builder.GetInsertBlock()->getModule()->getDataLayout(); @@ -1514,7 +1551,8 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { } const bool isNonTemporal = inst.getMetadata(LLVMContext::MD_nontemporal); - const bool isGlc = ordering != AtomicOrdering::NotAtomic; + const bool isGlc = + ordering != AtomicOrdering::NotAtomic || m_descriptors[bufferDesc].globallyCoherent.value_or(false); const bool isDlc = isGlc; // For buffer load on GFX10+, we set DLC = GLC Value *const baseIndex = m_builder.CreatePtrToInt(pointerValues[1], m_builder.getInt32Ty()); @@ -1605,14 +1643,6 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { } } - auto getBufferDesc = [&]() -> Value * { - if (isIndexedDesc) { - auto address = m_builder.CreatePtrToInt(bufferDesc, m_builder.getInt64Ty()); - return m_builder.CreateTrunc(address, m_builder.getInt32Ty()); - } - return bufferDesc; - }; - // The index in storeValue which we use next unsigned storeIndex = 0; @@ -1659,6 +1689,7 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { coherent.bits.slc = isNonTemporal; } + Value *indexValue = isStridedPointer ? pointerValues[2] : nullptr; if (isLoad) { bool accessSizeAllowed = true; if (m_pipelineState.getTargetInfo().getGfxIpVersion().major <= 11) { @@ -1667,9 +1698,6 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { accessSizeAllowed = accessSize >= 4; } - bool isStridedPointer = pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER; - Value *indexValue = isStridedPointer ? 
pointerValues[2] : nullptr; - #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 458033 // Old version of the code const bool isDivergentPtr = m_uniformityInfo.isDivergent(*pointerOperand); @@ -1713,14 +1741,14 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { #endif part = m_builder.CreateIntrinsic( intAccessType, intrinsic, - {getBufferDesc(), indexValue, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + {bufferDesc, indexValue, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); } else { unsigned intrinsicID = Intrinsic::amdgcn_raw_buffer_load; if (ordering != AtomicOrdering::NotAtomic) intrinsicID = Intrinsic::amdgcn_raw_atomic_buffer_load; part = m_builder.CreateIntrinsic( intAccessType, intrinsicID, - {getBufferDesc(), offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + {bufferDesc, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); } } } else { @@ -1734,14 +1762,14 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { } part = m_builder.CreateBitCast(part, intAccessType); copyMetadata(part, &inst); - if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) { - part = m_builder.CreateIntrinsic(m_builder.getVoidTy(), Intrinsic::amdgcn_struct_buffer_store, - {part, getBufferDesc(), pointerValues[2], offsetVal, m_builder.getInt32(0), - m_builder.getInt32(coherent.u32All)}); + if (isStridedPointer) { + part = m_builder.CreateIntrinsic( + m_builder.getVoidTy(), Intrinsic::amdgcn_struct_buffer_store, + {part, bufferDesc, indexValue, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); } else { part = m_builder.CreateIntrinsic( m_builder.getVoidTy(), Intrinsic::amdgcn_raw_buffer_store, - {part, getBufferDesc(), offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); + {part, bufferDesc, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)}); } } @@ -1871,12 
+1899,21 @@ Value *BufferOpLowering::createGlobalPointerAccess(Value *const bufferDesc, Valu Value *newOffset = offset; // index is for strided load which we need to handle the stride of the SRD. - if (strideIndex) { + if (strideIndex || m_pipelineState.getOptions().checkRawBufferAccessDescStride) { Value *desc1 = m_builder.CreateExtractElement(bufferDesc, 1); Value *stride = m_builder.CreateAnd(m_builder.CreateLShr(desc1, m_builder.getInt32(16)), m_builder.getInt32(0x3fff)); - bound = m_builder.CreateMul(bound, stride); - newOffset = m_builder.CreateAdd(m_builder.CreateMul(strideIndex, stride), newOffset); + Value *byteBound = m_builder.CreateMul(bound, stride); + + if (strideIndex) { + bound = byteBound; + newOffset = m_builder.CreateAdd(m_builder.CreateMul(strideIndex, stride), newOffset); + } else { + // It is not a strided load, but it is possible that the application/client binds a strided descriptor so if + // the stride is not zero, use bound in bytes to avoid wrong OOB check. + stride = m_builder.CreateICmpNE(stride, m_builder.getInt32(0)); + bound = m_builder.CreateSelect(stride, byteBound, bound); + } } Value *inBound = m_builder.CreateICmpULT(newOffset, bound); @@ -1908,7 +1945,7 @@ Value *BufferOpLowering::createGlobalPointerAccess(Value *const bufferDesc, Valu // Add on the index to the address. 
Value *pointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newOffset); - pointer = m_builder.CreateBitCast(pointer, type->getPointerTo(ADDR_SPACE_GLOBAL)); + pointer = m_builder.CreateBitCast(pointer, m_builder.getPtrTy(ADDR_SPACE_GLOBAL)); Value *newValue = callback(pointer); // Store inst doesn't need return a value from a phi node @@ -1927,55 +1964,6 @@ Value *BufferOpLowering::createGlobalPointerAccess(Value *const bufferDesc, Valu return nullptr; } -// ===================================================================================================================== -// Create a normal buffer descriptor -// -// @param buffAddress : The buffer address -// @param stride : The stride for strided buffer -Value *BufferOpLowering::createCompactDesc(Value *buffAddress, Value *stride) { - // Extract compact buffer descriptor - Value *addrLo = m_builder.CreateExtractElement(buffAddress, uint64_t(0)); - Value *addrHi = m_builder.CreateExtractElement(buffAddress, 1); - - // Build normal buffer descriptor - // Dword 0 - Value *bufDesc = PoisonValue::get(FixedVectorType::get(m_builder.getInt32Ty(), 4)); - bufDesc = m_builder.CreateInsertElement(bufDesc, addrLo, uint64_t(0)); - - // Dword 1 - if (stride) - addrHi = m_builder.CreateOr(addrHi, m_builder.CreateShl(stride, 16)); - bufDesc = m_builder.CreateInsertElement(bufDesc, addrHi, 1); - - // Dword 2 - SqBufRsrcWord2 sqBufRsrcWord2 = {}; - sqBufRsrcWord2.bits.numRecords = UINT32_MAX; - bufDesc = m_builder.CreateInsertElement(bufDesc, m_builder.getInt32(sqBufRsrcWord2.u32All), 2); - - // Dword 3 - SqBufRsrcWord3 sqBufRsrcWord3 = {}; - sqBufRsrcWord3.bits.dstSelX = BUF_DST_SEL_X; - sqBufRsrcWord3.bits.dstSelY = BUF_DST_SEL_Y; - sqBufRsrcWord3.bits.dstSelZ = BUF_DST_SEL_Z; - sqBufRsrcWord3.bits.dstSelW = BUF_DST_SEL_W; - - auto gfxIp = m_pipelineState.getTargetInfo().getGfxIpVersion(); - if (gfxIp.major == 10) { - sqBufRsrcWord3.gfx10.format = BUF_FORMAT_32_UINT; - sqBufRsrcWord3.gfx10.resourceLevel = 1; - 
sqBufRsrcWord3.gfx10.oobSelect = 2; - assert(sqBufRsrcWord3.u32All == 0x21014FAC); - } else if (gfxIp.major >= 11) { - sqBufRsrcWord3.gfx11.format = BUF_FORMAT_32_UINT; - sqBufRsrcWord3.gfx11.oobSelect = 2; - assert(sqBufRsrcWord3.u32All == 0x20014FAC); - } else { - llvm_unreachable("Not implemented!"); - } - bufDesc = m_builder.CreateInsertElement(bufDesc, m_builder.getInt32(sqBufRsrcWord3.u32All), 3); - return bufDesc; -} - // ===================================================================================================================== // Create a load from the given buffer address // diff --git a/lgc/patch/LowerCooperativeMatrix.cpp b/lgc/patch/LowerCooperativeMatrix.cpp index bf1b9db659..63bb650ae5 100644 --- a/lgc/patch/LowerCooperativeMatrix.cpp +++ b/lgc/patch/LowerCooperativeMatrix.cpp @@ -51,7 +51,8 @@ namespace lgc { static const Intrinsic::AMDGCNIntrinsics InvalidInstricID = Intrinsic::AMDGCNIntrinsics(0xFFFFFFFF); static const Intrinsic::AMDGCNIntrinsics GetWmmaIntrinsic(GfxIpVersion gfxIp, CooperativeMatrixElementType typeA, CooperativeMatrixElementType typeB, - CooperativeMatrixElementType typeC, bool isTiled = false) { + CooperativeMatrixElementType typeC, unsigned kMultiplier, + bool isTiled = false) { assert(gfxIp.major >= 11); switch (typeA) { case CooperativeMatrixElementType::Float16: { @@ -77,8 +78,11 @@ static const Intrinsic::AMDGCNIntrinsics GetWmmaIntrinsic(GfxIpVersion gfxIp, Co } case CooperativeMatrixElementType::Int4: { assert(typeA == typeB); - if (typeC == CooperativeMatrixElementType::Int32) - return Intrinsic::amdgcn_wmma_i32_16x16x16_iu4; + if (typeC == CooperativeMatrixElementType::Int32) { + if (kMultiplier == 1) + return Intrinsic::amdgcn_wmma_i32_16x16x16_iu4; + } + break; } default: @@ -149,9 +153,11 @@ void LowerCooperativeMatrix::processCoopMatrixFunction(Module &module) { // // @param elemType : the matrix element type // @param layout : the matrix layout +// @param kSize : the matrix K size // @returns : the type 
properties LowerCooperativeMatrix::TypeProperties LowerCooperativeMatrix::getTypeProperties(CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout) const { + CooperativeMatrixLayout layout, + unsigned kSize) const { TypeProperties props; props.matrixElementStride = 1; @@ -214,10 +220,11 @@ LowerCooperativeMatrix::TypeProperties LowerCooperativeMatrix::getTypeProperties // @param vecValue : Vector Value which maybe V16. // @param elemType : Element type for the matrix. // @param layout : Identify whether this matrix is A/B or C/D +// @param kSize : the matrix K size Value *LowerCooperativeMatrix::convFlatVecToCoopMatrixVec(BuilderCommon &builder, Value *vecValue, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout) { - auto props = getTypeProperties(elemType, layout); + CooperativeMatrixLayout layout, unsigned kSize) { + auto props = getTypeProperties(elemType, layout, kSize); if (props.numMatrixElements > props.numFlatElements) { SmallVector mask; @@ -241,10 +248,11 @@ Value *LowerCooperativeMatrix::convFlatVecToCoopMatrixVec(BuilderCommon &builder // @param matrixValue : Vector Value which maybe V16. // @param elemType : Element type for the matrix. 
// @param layout : Identify whether this matrix is A/B or C/D +// @param kSize : the matrix K size Value *LowerCooperativeMatrix::convCoopMatrixVecToFlatVec(BuilderCommon &builder, Value *matrixValue, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout) { - auto props = getTypeProperties(elemType, layout); + CooperativeMatrixLayout layout, unsigned kSize) { + auto props = getTypeProperties(elemType, layout, kSize); Type *elemTy = builder.transCooperativeMatrixElementType(elemType); if (elemTy->getScalarSizeInBits() < 8) elemTy = builder.getInt8Ty(); @@ -269,12 +277,10 @@ Value *LowerCooperativeMatrix::convCoopMatrixVecToFlatVec(BuilderCommon &builder // @param stride : The stride in bytes in memory between the first elements of consecutive rows (orcolumns) in the // source data. Guaranteed to be a multiple of the matrix element size. // @param isColMajor : Identify the order for the data stored in memory, col-major/row-major -// @param isFromPackedVal : Whether the loaded value is in a packed 8-bit format // @param insertPos : Where to insert the instruction LowerCooperativeMatrix::ComputeAddressInfo LowerCooperativeMatrix::computeAddressing(CooperativeMatrixLayout layout, CooperativeMatrixElementType elemType, - int waveSize, Value *stride, bool isColMajor, bool isFromPackedVal, - Instruction *insertPos) { + int waveSize, Value *stride, bool isColMajor, Instruction *insertPos) { BuilderBase builder(*m_context); builder.SetInsertPoint(insertPos); Value *threadId = getLaneNumber(builder); @@ -310,6 +316,8 @@ LowerCooperativeMatrix::computeAddressing(CooperativeMatrixLayout layout, Cooper if (isColMajor) { addrInfo.base = builder.CreateAdd(rowOffsetInFirstVgpr, builder.CreateMul(colOffsetPerLane, stride)); } else { + if (elemType == CooperativeMatrixElementType::Int4) + colOffsetPerLane = builder.CreateLShr(colOffsetPerLane, builder.getInt32(1)); // threadId%16 / 2 addrInfo.base = builder.CreateAdd(builder.CreateMul(rowOffsetInFirstVgpr, stride), 
colOffsetPerLane); addrInfo.macroStep = builder.CreateMul(addrInfo.macroStep, stride); addrInfo.microStep = builder.CreateMul(addrInfo.microStep, stride); @@ -317,28 +325,15 @@ LowerCooperativeMatrix::computeAddressing(CooperativeMatrixLayout layout, Cooper // Update address info for a packed 8-bit format in row major in the view of VGPRs layout if (!isColMajor) { - bool isToPackedVal = isa(insertPos) && (elemType == CooperativeMatrixElementType::Int4); - SmallVector nextLaneRes; - if (isFromPackedVal || isToPackedVal) { - if (layout != CooperativeMatrixLayout::FactorMatrixLayout) { - llvm_unreachable("This layout is not supported now."); - } - } - if (isFromPackedVal) { - Value *baseOffset = builder.CreateMul(stride, builder.getInt32(8)); - Value *isLessEight = builder.CreateICmpSLT(colOffsetPerLane, builder.getInt32(8)); - addrInfo.base = builder.CreateSRem(threadId, builder.getInt32(8)); - if (elemType == CooperativeMatrixElementType::Int4) { - addrInfo.base = builder.CreateSelect(isLessEight, addrInfo.base, builder.CreateAdd(addrInfo.base, baseOffset)); - } else { - addrInfo.base = builder.CreateMul(addrInfo.base, builder.getInt32(2)); - addrInfo.base = builder.CreateSelect(isLessEight, addrInfo.base, builder.CreateAdd(addrInfo.base, baseOffset)); - addrInfo.packOffset = builder.getInt32(1); - } - } else if (isToPackedVal) { - addrInfo.base = builder.CreateUDiv(addrInfo.base, builder.getInt32(2)); + bool isStoringPackedVal = + isa(insertPos) && (elemType == CooperativeMatrixElementType::Int4); + if (isStoringPackedVal) { + // The i4 value from two threads are merged into two i4vec2. The first i4vec2 is stored by the current thread and + // the second is stored by the next thread. 
+ Value *offset = builder.CreateMul(builder.CreateSRem(threadId, builder.getInt32(2)), stride); + addrInfo.base = builder.CreateAdd(addrInfo.base, offset); + // The step is doubled since the next row will be written by the next thread addrInfo.macroStep = builder.CreateMul(addrInfo.macroStep, builder.getInt32(2)); - addrInfo.packOffset = builder.CreateMul(builder.CreateSRem(threadId, builder.getInt32(2)), stride); } } @@ -354,6 +349,8 @@ void LowerCooperativeMatrix::visitCooperativeMatrixLengthOp(CooperativeMatrixLen builder.SetInsertPoint(&matrixlength); auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value()); auto layout = matrixlength.getLayout(); + unsigned kSize = matrixlength.getKSize(); + (void)kSize; unsigned length = 0; switch (layout) { case CooperativeMatrixLayout::FactorMatrixLayout: @@ -393,6 +390,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixLoadOp(CooperativeMatrixLoadO auto layout = load.getLayout(); auto isColMajor = load.getColMajor(); auto alignment = load.getAlignment(); + unsigned kSize = load.getKSize(); // Calc element offset in memory Type *elemTy = builder.transCooperativeMatrixElementType(elemType); @@ -409,12 +407,17 @@ void LowerCooperativeMatrix::visitCooperativeMatrixLoadOp(CooperativeMatrixLoadO bool isCoherent = memoryAccess & (unsigned)(CooperativeMatrixMemoryAccess::MemoryAccessCoherentMask); bool isTemporal = memoryAccess & (unsigned)(CooperativeMatrixMemoryAccess::MemoryAccessTemporalMask); - auto props = getTypeProperties(elemType, layout); + auto props = getTypeProperties(elemType, layout, kSize); bool isLoadingPackedVal = !isColMajor && elemType == CooperativeMatrixElementType::Int4; - auto addrInfo = computeAddressing(layout, elemType, waveSize, stride, isColMajor, isLoadingPackedVal, &load); + auto addrInfo = computeAddressing(layout, elemType, waveSize, stride, isColMajor, &load); Value *vecVal = PoisonValue::get(FixedVectorType::get(elemTy, props.numFlatElements)); - for (unsigned idx = 0; idx < 
props.numFlatElements; ++idx) { + unsigned numFlatElements = props.numFlatElements; + + SmallVector packedVals; + if (isLoadingPackedVal) + numFlatElements *= 2; + for (unsigned idx = 0; idx < numFlatElements; ++idx) { Value *macroOffset = builder.CreateMul(addrInfo.macroStep, builder.getInt32(idx / addrInfo.microCount)); Value *microOffset = builder.CreateMul(addrInfo.microStep, builder.getInt32(idx % addrInfo.microCount)); Value *offsetInRowCol = builder.CreateAdd(macroOffset, microOffset); @@ -430,15 +433,39 @@ void LowerCooperativeMatrix::visitCooperativeMatrixLoadOp(CooperativeMatrixLoadO } else { // For rowMajor@B/C and colMajor@A, as the elements of one lane aren't continuous, no alignments needed. eleVal = builder.CreateLoad(elemTy, elePtr, isVolatile); + packedVals.push_back(eleVal); } if (isCoherent && !(addrSpace == ADDR_SPACE_LOCAL && dataBitwidth < 32)) cast(eleVal)->setAtomic(AtomicOrdering::Unordered); if (isTemporal) cast(eleVal)->setMetadata(LLVMContext::MD_nontemporal, MDNode::get(builder.getContext(), {})); - vecVal = builder.CreateInsertElement(vecVal, eleVal, idx); + if (!isLoadingPackedVal) + vecVal = builder.CreateInsertElement(vecVal, eleVal, idx); + } + if (isLoadingPackedVal) { + // The low 4-bits in the loaded elements write into a VGPR in the current even thread and the high 4-bits in the + // loaded elements written into the odd thread + Value *threadId = getLaneNumber(builder); + Value *isEvenTid = builder.CreateICmpEQ(builder.CreateAnd(threadId, builder.getInt32(1)), builder.getInt32(0)); + for (unsigned idx = 0; idx < numFlatElements; idx += 2) { + Value *lowBits = builder.CreateAnd(packedVals[idx], builder.getInt8(0xf)); + Value *nextLowBits = builder.CreateAnd(packedVals[idx + 1], builder.getInt8(0xf)); + nextLowBits = builder.CreateShl(nextLowBits, builder.getInt8(4)); + Value *evenVal = builder.CreateOr(lowBits, nextLowBits); + + Value *highBits = builder.CreateAnd(packedVals[idx], builder.getInt8(0xf0)); + highBits = 
builder.CreateLShr(packedVals[idx], builder.getInt8(4)); + Value *nextHighBits = builder.CreateAnd(packedVals[idx + 1], builder.getInt8(0xf0)); + Value *oddVal = builder.CreateOr(highBits, nextHighBits); + + const unsigned elemIdx = idx / 2; + Value *evenVec = builder.CreateInsertElement(vecVal, evenVal, elemIdx); + Value *oddVec = builder.CreateInsertElement(vecVal, oddVal, elemIdx); + vecVal = builder.CreateSelect(isEvenTid, evenVec, oddVec); + } } - Value *coMatrix = convFlatVecToCoopMatrixVec(builder, vecVal, elemType, layout); + Value *coMatrix = convFlatVecToCoopMatrixVec(builder, vecVal, elemType, layout, kSize); m_coopMatrixCalls.push_back(&load); load.replaceAllUsesWith(coMatrix); @@ -462,6 +489,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixStoreOp(CooperativeMatrixStor auto isColMajor = store.getColMajor(); auto alignment = store.getAlignment(); Value *vecVal = store.getStoreValue(); + unsigned kSize = store.getKSize(); auto shaderStage = getShaderStage(builder.GetInsertBlock()->getParent()); auto waveSize = m_pipelineState->getShaderWaveSize(shaderStage.value()); assert(waveSize == 32 || waveSize == 64); @@ -482,17 +510,14 @@ void LowerCooperativeMatrix::visitCooperativeMatrixStoreOp(CooperativeMatrixStor bool isCoherent = memoryAccess & (unsigned)(CooperativeMatrixMemoryAccess::MemoryAccessCoherentMask); bool isTemporal = memoryAccess & (unsigned)(CooperativeMatrixMemoryAccess::MemoryAccessTemporalMask); - auto props = getTypeProperties(elemType, layout); - bool isFromPackedVal = m_valPackedInMatrixes.find(vecVal) != m_valPackedInMatrixes.end(); + auto props = getTypeProperties(elemType, layout, kSize); - auto addrInfo = computeAddressing(layout, elemType, waveSize, stride, isColMajor, isFromPackedVal, &store); + auto addrInfo = computeAddressing(layout, elemType, waveSize, stride, isColMajor, &store); - bool isToPackedVal = !isColMajor && (elemType == CooperativeMatrixElementType::Int4); - bool isFromPackedToNormal = !isColMajor && 
isFromPackedVal && !isToPackedVal; - bool isFromNormalToPacked = !isColMajor && !isFromPackedVal && isToPackedVal; + bool isStoringPackedVal = !isColMajor && (elemType == CooperativeMatrixElementType::Int4); SmallVector nextLaneRes; - Value *threadId = isFromNormalToPacked ? getLaneNumber(builder) : nullptr; - if (isToPackedVal) { + Value *threadId = isStoringPackedVal ? getLaneNumber(builder) : nullptr; + if (isStoringPackedVal) { // The being store value is packed from part of 8-bit values of the adjacent threads. We use permlane16 to get the // value from the adjacent thread. const unsigned lowSel = 0x67452301; @@ -506,22 +531,14 @@ void LowerCooperativeMatrix::visitCooperativeMatrixStoreOp(CooperativeMatrixStor nextLaneRes.push_back(permLaneX16); } } - vecVal = convCoopMatrixVecToFlatVec(builder, vecVal, elemType, layout); + vecVal = convCoopMatrixVecToFlatVec(builder, vecVal, elemType, layout, kSize); for (unsigned idx = 0; idx < props.numFlatElements; ++idx) { - unsigned index = idx; - if (isFromPackedToNormal) - index = idx / 2; - - Value *macroOffset = builder.CreateMul(addrInfo.macroStep, builder.getInt32(index / addrInfo.microCount)); - Value *microOffset = builder.CreateMul(addrInfo.microStep, builder.getInt32(index % addrInfo.microCount)); + Value *macroOffset = builder.CreateMul(addrInfo.macroStep, builder.getInt32(idx / addrInfo.microCount)); + Value *microOffset = builder.CreateMul(addrInfo.microStep, builder.getInt32(idx % addrInfo.microCount)); Value *offsetInRowCol = builder.CreateAdd(macroOffset, microOffset); Value *offsetInMatrix = builder.CreateAdd(addrInfo.base, offsetInRowCol); - bool isOddIdx = (idx & 1) == 1; - if (isFromNormalToPacked || (isFromPackedToNormal && isOddIdx)) - offsetInMatrix = builder.CreateAdd(offsetInMatrix, addrInfo.packOffset); - Value *elePtr = builder.CreateGEP(elemTy, dataPtr, offsetInMatrix); Value *oneElement = builder.CreateExtractElement(vecVal, idx); StoreInst *st = nullptr; @@ -531,14 +548,15 @@ void 
LowerCooperativeMatrix::visitCooperativeMatrixStoreOp(CooperativeMatrixStor Align compAlignment = commonAlignment(Align(alignment), constantOffsetInRowCol); st = builder.CreateAlignedStore(oneElement, elePtr, compAlignment, isVolatile); } else { - if (isFromNormalToPacked) { + if (isStoringPackedVal) { Value *adjacentElem = builder.CreateExtractElement(nextLaneRes[idx / 4], idx % 4); Value *evenTid = builder.CreateICmpEQ(builder.CreateAnd(threadId, builder.getInt32(1)), builder.getInt32(0)); Value *mask = builder.CreateSelect(evenTid, builder.getInt8(0xF), builder.getInt8(0xF0)); oneElement = builder.CreateAnd(oneElement, mask); adjacentElem = builder.CreateAnd(adjacentElem, mask); - adjacentElem = builder.CreateSelect(evenTid, builder.CreateShl(adjacentElem, builder.getInt8(4)), - builder.CreateLShr(adjacentElem, builder.getInt8(4))); + Value *evenElem = builder.CreateShl(adjacentElem, builder.getInt8(4)); + Value *oddElem = builder.CreateLShr(adjacentElem, builder.getInt8(4)); + adjacentElem = builder.CreateSelect(evenTid, evenElem, oddElem); oneElement = builder.CreateOr(oneElement, adjacentElem); } @@ -564,14 +582,15 @@ void LowerCooperativeMatrix::visitCooperativeMatrixFillOp(CooperativeMatrixFillO auto elemType = fill.getElemType(); auto layout = fill.getLayout(); Value *value = fill.getScalar(); - auto props = getTypeProperties(elemType, layout); + unsigned kSize = fill.getKSize(); + auto props = getTypeProperties(elemType, layout, kSize); Type *flatType = FixedVectorType::get(builder.transCooperativeMatrixElementType(elemType), props.numMatrixElements); Value *vec = PoisonValue::get(flatType); for (unsigned idx = 0; idx < props.numMatrixElements; idx++) vec = builder.CreateInsertElement(vec, value, idx); - Value *fillValue = convFlatVecToCoopMatrixVec(builder, vec, elemType, layout); + Value *fillValue = convFlatVecToCoopMatrixVec(builder, vec, elemType, layout, kSize); m_coopMatrixCalls.push_back(&fill); fill.replaceAllUsesWith(fillValue); @@ -1587,10 
+1606,12 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul auto matrixAType = muladd.getMatrixAElemType(); auto matrixBType = muladd.getMatrixBElemType(); auto matrixCType = muladd.getMatrixCElemType(); + [[maybe_unused]] auto matrixDType = muladd.getMatrixDElemType(); bool isSignedA = muladd.getIsSignedA(); bool isSignedB = muladd.getIsSignedB(); bool isSatOrOpsel = muladd.getIsSatOrOpsel(); StringRef instName = muladd.getName(); + unsigned kMultiplier = muladd.getKMultiplier(); // Gfx11: // wave64: @@ -1618,7 +1639,8 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul if (BuilderCommon::isTypeNCooperativeMatrix(matrixAType, 16)) { assert(matrixAType == matrixBType); - { factorFlatElemNum = 16; } + if (m_gfxIp.major <= 11) + factorFlatElemNum = 16; Type *factorType = FixedVectorType::get(builder.transCooperativeMatrixElementType(matrixAType), factorFlatElemNum); matrixA = builder.CreateBitCast(matrixA, factorType); @@ -1633,15 +1655,13 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul matrixC = waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef({0, 1, 2, 3}), "shuffleVector") : matrixC; } else if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 16)) { - { - if (m_gfxIp.major == 12) { - // When gfxIp.major > 12, waveSize will always be 32 then matrixC size is solid without any necessary swizzle. - matrixC = - waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef({0, 1}), "shuffleVector") : matrixC; - } else { // m_gfxIp.major <= 11 - matrixC = waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef({0, 1, 2, 3}), "shuffleVector") - : matrixC; - } + if (m_gfxIp.major == 12) { + // When gfxIp.major > 12, waveSize will always be 32 then matrixC size is solid without any necessary swizzle. + matrixC = + waveSize == 64 ? 
builder.CreateShuffleVector(matrixC, ArrayRef({0, 1}), "shuffleVector") : matrixC; + } else if (m_gfxIp.major < 12) { // m_gfxIp.major <= 11 + matrixC = waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef({0, 1, 2, 3}), "shuffleVector") + : matrixC; } Type *castType = nullptr; @@ -1658,7 +1678,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul } Intrinsic::AMDGCNIntrinsics intrinsic = InvalidInstricID; - intrinsic = GetWmmaIntrinsic(m_gfxIp, matrixAType, matrixBType, matrixCType, muladd.getIsTied()); + intrinsic = GetWmmaIntrinsic(m_gfxIp, matrixAType, matrixBType, matrixCType, kMultiplier, muladd.getIsTied()); if (intrinsic == InvalidInstricID) llvm_unreachable("HW intrinsics not supported!"); @@ -1700,7 +1720,13 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul llvm_unreachable("Should never be called!"); break; } - matrixD = builder.CreateIntrinsic(matrixC->getType(), intrinsic, args, nullptr, instName); + auto retTy = matrixC->getType(); + if (matrixCType != matrixDType) { + assert(matrixDType == CooperativeMatrixElementType::Float32 && + matrixCType == CooperativeMatrixElementType::Int32); + retTy = muladd.getResult()->getType(); + } + matrixD = builder.CreateIntrinsic(retTy, intrinsic, args, nullptr, instName); if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 16)) { unsigned coopVeclength = cast(matrixD->getType())->getNumElements(); @@ -2276,7 +2302,7 @@ void LowerCooperativeMatrix::visitCooperativeRowAccExpandOp(CooperativeRowAccExp assert(rowAccElemType == matrixElemType); assert(matrixLayout == CooperativeMatrixLayout::AccumulatorMatrixLayout); - auto props = getTypeProperties(matrixElemType, matrixLayout); + auto props = getTypeProperties(matrixElemType, matrixLayout, 16); Type *flatType = FixedVectorType::get(builder.transCooperativeMatrixElementType(matrixElemType), props.numFlatElements); Value *flatVec = PoisonValue::get(flatType); diff --git 
a/lgc/patch/LowerGpuRt.cpp b/lgc/patch/LowerGpuRt.cpp index 52e17f61a1..b0a47ba2cb 100644 --- a/lgc/patch/LowerGpuRt.cpp +++ b/lgc/patch/LowerGpuRt.cpp @@ -325,15 +325,6 @@ void LowerGpuRt::visitLdsStackInit(GpurtLdsStackInitOp &inst) { void LowerGpuRt::visitFloatWithRoundMode(lgc::GpurtFloatWithRoundModeOp &inst) { m_builder->SetInsertPoint(&inst); - // Use setReg to set SQ_WAVE_MODE. - // hwRegId : SQ related register index. - // Offset : register field offset. - // Width : field width. - // hwReg : (hwRegId | (Offset << 6) | ((Width - 1) << 11) - constexpr uint32_t sqHwRegMode = 1; - constexpr uint32_t width = 2; - constexpr uint32_t offset = 0; - enum OperationType : uint32_t { Add = 0, Sub, Mul }; auto func = inst.getCalledFunction(); auto retType = func->getReturnType(); @@ -342,8 +333,12 @@ void LowerGpuRt::visitFloatWithRoundMode(lgc::GpurtFloatWithRoundModeOp &inst) { uint32_t rm = cast(inst.getRoundMode())->getZExtValue(); uint32_t op = cast(inst.getOperation())->getZExtValue(); - // WARNING: This isn't supported robustly by the IR semantics and the backend, but it's the best we can do for now. - BuilderBase::get(*m_builder).CreateSetReg(sqHwRegMode, offset, width, m_builder->getInt32(rm)); + static constexpr RoundingMode rmTable[4] = {RoundingMode::NearestTiesToEven, RoundingMode::TowardPositive, + RoundingMode::TowardNegative, RoundingMode::TowardZero}; + + // Use llvm.set.rounding to modify rounding mode. + m_builder->CreateIntrinsic(m_builder->getVoidTy(), Intrinsic::set_rounding, + {m_builder->getInt32(static_cast(rmTable[rm]))}); Value *result = PoisonValue::get(retType); if (op == OperationType::Add) @@ -353,9 +348,9 @@ void LowerGpuRt::visitFloatWithRoundMode(lgc::GpurtFloatWithRoundModeOp &inst) { else result = m_builder->CreateFMul(src0, src1); - // set back to RoundTiesToEven. 
- uint32_t roundTiesToEven = 1; - BuilderBase::get(*m_builder).CreateSetReg(sqHwRegMode, offset, width, m_builder->getInt32(roundTiesToEven)); + // Set back to RoundTiesToEven. + m_builder->CreateIntrinsic(m_builder->getVoidTy(), Intrinsic::set_rounding, + {m_builder->getInt32(static_cast(RoundingMode::NearestTiesToEven))}); inst.replaceAllUsesWith(result); m_callsToLower.push_back(&inst); diff --git a/lgc/patch/LowerInOut.cpp b/lgc/patch/LowerInOut.cpp index 46b3bb1e56..e887512f7e 100644 --- a/lgc/patch/LowerInOut.cpp +++ b/lgc/patch/LowerInOut.cpp @@ -32,12 +32,14 @@ #include "lgc/patch/LowerInOut.h" #include "lgc/Builder.h" #include "lgc/BuiltIns.h" +#include "lgc/Debug.h" #include "lgc/LgcDialect.h" +#include "lgc/builder/BuilderImpl.h" #include "lgc/state/AbiUnlinked.h" #include "lgc/state/PalMetadata.h" #include "lgc/state/PipelineShaders.h" -#include "lgc/util/Debug.h" #include "lgc/util/WorkgroupLayout.h" +#include "llvm-dialects/Dialect/Visitor.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Support/Debug.h" @@ -100,6 +102,17 @@ PreservedAnalyses LowerInOut::run(Module &module, ModuleAnalysisManager &analysi m_gfxIp = m_pipelineState->getTargetInfo().getGfxIpVersion(); m_pipelineSysValues.initialize(m_pipelineState); + auto entryPoint = pipelineShaders.getEntryPoint(ShaderStage::Fragment); + if (entryPoint) { + m_entryPoint = entryPoint; + m_shaderStage = ShaderStage::Fragment; + const auto fetchVisitor = llvm_dialects::VisitorBuilder() + .add(&LowerInOut::visitEvalIjOffsetSmoothOp) + .add(&LowerInOut::visitAdjustIjOp) + .build(); + fetchVisitor.visit(*this, module); + } + const auto stageMask = m_pipelineState->getShaderStageMask(); m_hasTs = stageMask.contains_any({ShaderStage::TessControl, ShaderStage::TessEval}); m_hasGs = stageMask.contains(ShaderStage::Geometry); @@ -109,7 +122,7 @@ PreservedAnalyses LowerInOut::run(Module &module, ModuleAnalysisManager &analysi auto name = func.getName(); if 
(name.starts_with("lgc.input")) inputCallees.push_back(&func); - else if (name.starts_with("lgc.output") || name == "llvm.amdgcn.s.sendmsg") + else if (name.starts_with("lgc.output") || name.starts_with("lgc.gs") || name == "lgc.write.xfb.output") otherCallees.push_back(&func); } @@ -170,6 +183,12 @@ PreservedAnalyses LowerInOut::run(Module &module, ModuleAnalysisManager &analysi } m_exportCalls.clear(); + for (auto callInst : m_gsMsgCalls) { + callInst->dropAllReferences(); + callInst->eraseFromParent(); + } + m_gsMsgCalls.clear(); + m_pipelineSysValues.clear(); return PreservedAnalyses::none(); @@ -201,6 +220,10 @@ void LowerInOut::processFunction(Function &func, ShaderStageEnum shaderStage, Sm // @param [in/out] func : LLVM function to be run on // @param postDomTree : The PostDominatorTree of the \p func void LowerInOut::markExportDone(Function *func, PostDominatorTree &postDomTree) { + // Position export in NGG primitive shader is handled later on. Here we only process position export in legacy HW VS. 
+ if (m_pipelineState->getNggControl()->enableNgg) + return; + SmallVector expInsts; Function *expDecl = m_module->getFunction("llvm.amdgcn.exp.f32"); @@ -269,89 +292,100 @@ void LowerInOut::processShader() { m_threadId = getSubgroupLocalInvocationId(builder); } - // Initialize calculation factors for tessellation shader + // Initialize HW configurations for tessellation shaders if (m_shaderStage == ShaderStage::TessControl || m_shaderStage == ShaderStage::TessEval) { const auto stageMask = m_pipelineState->getShaderStageMask(); const bool hasTcs = stageMask.contains(ShaderStage::TessControl); - auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.calcFactor; - if (!calcFactor.initialized) { - calcFactor.initialized = true; + auto &hwConfig = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.hwConfig; + if (!hwConfig.initialized) { + hwConfig.initialized = true; // // NOTE: The LDS for tessellation is as follow: // - // +-------------+----------------+------------------+-------------+ - // On-chip | Tess Factor | HS Patch Count | Special TF Value | Input Patch | (LDS) - // +-------------+----------------+------------------+-------------+ + // +----------------+------------------+--------------+-------------+-------------+-------------+ + // On-chip | HS Patch Count | Special TF Value | Output Patch | Patch Const | Tess Factor | Input Patch | (LDS) + // +----------------+------------------+--------------+-------------+-------------+-------------+ // // +--------------+-------------+ // Off-chip | Output Patch | Patch Const | (LDS Buffer) // +--------------+-------------+ // - // inPatchTotalSize = inVertexCount * inVertexStride * patchCountPerThreadGroup - // outPatchTotalSize = outVertexCount * outVertexStride * patchCountPerThreadGroup - // patchConstTotalSize = patchConstCount * 4 * patchCountPerThreadGroup - // tessFactorTotalSize = 6 * patchCountPerThreadGroup + // 
inputPatchTotalSize = inputVertexCount * inputVertexStride * maxNumHsPatchesPerGroup + // outputPatchTotalSize = outputVertexCount * outputVertexStride * maxNumHsPatchesPerGroup + // patchConstTotalSize = patchConstCount * 4 * maxNumHsPatchesPerGroup + // tessFactorTotalSize = 6 * maxNumHsPatchesPerGroup // const auto &tcsInOutUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage; const auto &tesInOutUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessEval)->inOutUsage; - const unsigned inLocCount = std::max(tcsInOutUsage.inputMapLocCount, 1u); - const unsigned outLocCount = + const unsigned inputLocCount = std::max(tcsInOutUsage.inputMapLocCount, 1u); + const unsigned outputLocCount = hasTcs ? std::max(tcsInOutUsage.outputMapLocCount, 1u) : std::max(tesInOutUsage.inputMapLocCount, 1u); - const unsigned inVertexCount = m_pipelineState->getNumPatchControlPoints(); - const unsigned outVertexCount = + const unsigned inputVertexCount = m_pipelineState->getNumPatchControlPoints(); + const unsigned outputVertexCount = hasTcs ? 
m_pipelineState->getShaderModes()->getTessellationMode().outputVertices : MaxTessPatchVertices; - unsigned tessFactorStride = 0; + unsigned tessFactorCount = 0; switch (m_pipelineState->getShaderModes()->getTessellationMode().primitiveMode) { case PrimitiveMode::Triangles: - tessFactorStride = 4; + tessFactorCount = 4; break; case PrimitiveMode::Quads: - tessFactorStride = 6; + tessFactorCount = 6; break; case PrimitiveMode::Isolines: - tessFactorStride = 2; + tessFactorCount = 2; break; default: llvm_unreachable("Should never be called!"); break; } + // Use odd-dword stride to avoid LDS bank conflict + assert(tessFactorCount % 2 == 0); + hwConfig.onChip.tessFactorStride = tessFactorCount + 1; - calcFactor.inVertexStride = inLocCount * 4; - calcFactor.outVertexStride = outLocCount * 4; + // Use odd-dword stride to avoid LDS bank conflict + hwConfig.onChip.inputVertexStride = (inputLocCount * 4) | 1; + hwConfig.onChip.inputPatchSize = inputVertexCount * hwConfig.onChip.inputVertexStride; - const unsigned patchConstCount = - hasTcs ? tcsInOutUsage.perPatchOutputMapLocCount : tesInOutUsage.perPatchInputMapLocCount; - calcFactor.patchConstSize = patchConstCount * 4; + hwConfig.onChip.outputVertexStride = (outputLocCount * 4) | 1; + hwConfig.onChip.outputPatchSize = outputVertexCount * hwConfig.onChip.outputVertexStride; - calcFactor.patchCountPerThreadGroup = - calcPatchCountPerThreadGroup(inVertexCount, calcFactor.inVertexStride, outVertexCount, - calcFactor.outVertexStride, patchConstCount, tessFactorStride); + hwConfig.offChip.outputVertexStride = outputLocCount * 4; + hwConfig.offChip.outputPatchSize = outputVertexCount * hwConfig.offChip.outputVertexStride; - const unsigned inPatchSize = inVertexCount * calcFactor.inVertexStride; - const unsigned inPatchTotalSize = calcFactor.patchCountPerThreadGroup * inPatchSize; + const unsigned patchConstCount = + hasTcs ? 
tcsInOutUsage.perPatchOutputMapLocCount : tesInOutUsage.perPatchInputMapLocCount; + // Use odd-dword stride to avoid LDS bank conflict + hwConfig.onChip.patchConstSize = 0; + hwConfig.offChip.patchConstSize = 0; + if (patchConstCount > 0) { + hwConfig.onChip.patchConstSize = (patchConstCount * 4) | 1; + hwConfig.offChip.patchConstSize = patchConstCount * 4; + } - const unsigned outPatchSize = outVertexCount * calcFactor.outVertexStride; - const unsigned outPatchTotalSize = calcFactor.patchCountPerThreadGroup * outPatchSize; + const unsigned ldsSizePerPatch = hwConfig.onChip.outputPatchSize + hwConfig.onChip.patchConstSize + + hwConfig.onChip.tessFactorStride + hwConfig.onChip.inputPatchSize; + const unsigned ldsBufferSizePerPatch = hwConfig.offChip.outputPatchSize + hwConfig.offChip.patchConstSize; + hwConfig.maxNumPatchesPerGroup = calcMaxNumPatchesPerGroup(inputVertexCount, outputVertexCount, tessFactorCount, + ldsSizePerPatch, ldsBufferSizePerPatch); - const unsigned patchConstTotalSize = calcFactor.patchCountPerThreadGroup * calcFactor.patchConstSize; - const unsigned tessFactorTotalSize = calcFactor.patchCountPerThreadGroup * MaxTessFactorsPerPatch; + const unsigned onChipOutputPatchTotalSize = hwConfig.maxNumPatchesPerGroup * hwConfig.onChip.outputPatchSize; + const unsigned offChipOutputPatchTotalSize = hwConfig.maxNumPatchesPerGroup * hwConfig.offChip.outputPatchSize; - calcFactor.outPatchSize = outPatchSize; - calcFactor.inPatchSize = inPatchSize; + const unsigned onChipPatchConstTotalSize = hwConfig.maxNumPatchesPerGroup * hwConfig.onChip.patchConstSize; + const unsigned offChipPatchConstTotalSize = hwConfig.maxNumPatchesPerGroup * hwConfig.offChip.patchConstSize; - // NOTE: Tess factors are always stored to on-chip LDS first. Then, they are store to TF buffer and off-chip - // LDS buffer (which will be loaded by TES). 
- calcFactor.offChip.outPatchStart = 0; - calcFactor.offChip.patchConstStart = calcFactor.offChip.outPatchStart + outPatchTotalSize; + const unsigned inputPatchTotalSize = hwConfig.maxNumPatchesPerGroup * hwConfig.onChip.inputPatchSize; + const unsigned tessFactorTotalSize = hwConfig.maxNumPatchesPerGroup * hwConfig.onChip.tessFactorStride; - calcFactor.tessFactorStride = tessFactorStride; - calcFactor.onChip.inPatchStart = tessFactorTotalSize; - calcFactor.tessOnChipLdsSize = tessFactorTotalSize + inPatchTotalSize; + // NOTE: Tess factors and TCS outputs are always stored to on-chip LDS first. Then, they are store to TF buffer + // and off-chip LDS buffer (which will be loaded by TES). + hwConfig.offChip.outputPatchStart = 0; + hwConfig.offChip.patchConstStart = hwConfig.offChip.outputPatchStart + offChipOutputPatchTotalSize; if (m_pipelineState->canOptimizeTessFactor()) { // @@ -365,75 +399,90 @@ void LowerInOut::processShader() { // |<---- Wave 0 --->| |<---- Wave N --->| // assert(m_gfxIp.major >= 11); - calcFactor.onChip.hsPatchCountStart = calcFactor.onChip.inPatchStart; // One dword to store actual HS wave count - calcFactor.onChip.specialTfValueStart = calcFactor.onChip.hsPatchCountStart + 1; + hwConfig.onChip.hsPatchCountStart = 0; // One dword to store actual HS patch count + hwConfig.onChip.specialTfValueStart = hwConfig.onChip.hsPatchCountStart + 1; const unsigned maxNumHsWaves = - MaxHsThreadsPerSubgroup / m_pipelineState->getMergedShaderWaveSize(ShaderStage::TessControl); - calcFactor.specialTfValueSize = maxNumHsWaves * 2; - calcFactor.tessOnChipLdsSize += 1 + calcFactor.specialTfValueSize; - calcFactor.onChip.inPatchStart += 1 + calcFactor.specialTfValueSize; + MaxHsThreadsPerSubgroup / m_pipelineState->getShaderWaveSize(ShaderStage::TessControl); + hwConfig.onChip.specialTfValueSize = maxNumHsWaves * 2; } + hwConfig.onChip.outputPatchStart = hwConfig.onChip.specialTfValueStart + hwConfig.onChip.specialTfValueSize; + hwConfig.onChip.patchConstStart 
= hwConfig.onChip.outputPatchStart + onChipOutputPatchTotalSize; + hwConfig.onChip.tessFactorStart = hwConfig.onChip.patchConstStart + onChipPatchConstTotalSize; + hwConfig.onChip.inputPatchStart = hwConfig.onChip.tessFactorStart + tessFactorTotalSize; + + hwConfig.tessOnChipLdsSize = hwConfig.onChip.inputPatchStart + inputPatchTotalSize; + // NOTE: If ray query uses LDS stack, the expected max thread count in the group is 64. And we force wave size // to be 64 in order to keep all threads in the same wave. In the future, we could consider to get rid of this // restriction by providing the capability of querying thread ID in group rather than in wave. const auto vsResUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Vertex); const auto tcsResUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl); if (vsResUsage->useRayQueryLdsStack || tcsResUsage->useRayQueryLdsStack) - calcFactor.rayQueryLdsStackSize = MaxRayQueryLdsStackEntries * MaxRayQueryThreadsPerGroup; + hwConfig.rayQueryLdsStackSize = MaxRayQueryLdsStackEntries * MaxRayQueryThreadsPerGroup; + + // Make sure we don't run out of LDS space. 
+ assert(hwConfig.tessOnChipLdsSize + hwConfig.rayQueryLdsStackSize <= + m_pipelineState->getTargetInfo().getGpuProperty().ldsSizePerThreadGroup); + + auto printLdsLayout = [=](const char *name, unsigned offset, unsigned size) { + if (size != 0) { + LLPC_OUTS(format("%-30s : offset = 0x%04" PRIX32 ", size = 0x%04" PRIX32, name, offset, size)); + LLPC_OUTS("\n"); + } + }; LLPC_OUTS("===============================================================================\n"); - LLPC_OUTS("// LLPC tessellation calculation factor results\n\n"); - LLPC_OUTS("Patch count per thread group: " << calcFactor.patchCountPerThreadGroup << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("Tess factor start: 0 (LDS)\n"); - LLPC_OUTS("Tess factor total size (in dwords): " << tessFactorTotalSize << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("Input vertex count: " << inVertexCount << "\n"); - LLPC_OUTS("Input vertex stride: " << calcFactor.inVertexStride << "\n"); - LLPC_OUTS("Input patch size (in dwords): " << inPatchSize << "\n"); - LLPC_OUTS("Input patch start: " << calcFactor.onChip.inPatchStart << " (LDS)\n"); - LLPC_OUTS("Input patch total size (in dwords): " << inPatchTotalSize << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("Output vertex count: " << outVertexCount << "\n"); - LLPC_OUTS("Output vertex stride: " << calcFactor.outVertexStride << "\n"); - LLPC_OUTS("Output patch size (in dwords): " << outPatchSize << "\n"); - LLPC_OUTS("Output patch start: " << calcFactor.offChip.outPatchStart << " (LDS buffer)\n"); - LLPC_OUTS("Output patch total size (in dwords): " << outPatchTotalSize << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("Patch constant count: " << patchConstCount << "\n"); - LLPC_OUTS("Patch constant size (in dwords): " << calcFactor.patchConstSize << "\n"); - LLPC_OUTS("Patch constant start: " << calcFactor.offChip.patchConstStart << " (LDS buffer)\n"); - LLPC_OUTS("Patch constant total size (in dwords): " << patchConstTotalSize << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("HS patch count start: " << 
calcFactor.onChip.hsPatchCountStart << " (LDS)\n"); - LLPC_OUTS("HS wave count size (in dwords): " << 1 << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("Special TF value start: " << calcFactor.onChip.specialTfValueStart << " (LDS)\n"); - LLPC_OUTS("Special TF value size (in dwords): " << calcFactor.specialTfValueSize << "\n"); - LLPC_OUTS("\n"); - LLPC_OUTS("Tess factor stride: " << tessFactorStride << " ("); + LLPC_OUTS("// LLPC HW tessellation configurations\n\n"); + LLPC_OUTS("MaxNumPatchesPerGroup = " << hwConfig.maxNumPatchesPerGroup << "\n"); + LLPC_OUTS("Primitive = "); switch (m_pipelineState->getShaderModes()->getTessellationMode().primitiveMode) { case PrimitiveMode::Triangles: - LLPC_OUTS("triangles"); + LLPC_OUTS("Triangles"); break; case PrimitiveMode::Quads: - LLPC_OUTS("quads"); + LLPC_OUTS("Quads"); break; case PrimitiveMode::Isolines: - LLPC_OUTS("isolines"); + LLPC_OUTS("Isolines"); break; default: llvm_unreachable("Should never be called!"); break; } - LLPC_OUTS(")\n\n"); - LLPC_OUTS("Tess on-chip LDS total size (in dwords): " << calcFactor.tessOnChipLdsSize << "\n"); - if (calcFactor.rayQueryLdsStackSize > 0) { - LLPC_OUTS("Ray query LDS stack size (in dwords): " << calcFactor.rayQueryLdsStackSize - << " (start = " << calcFactor.tessOnChipLdsSize << ")\n"); + LLPC_OUTS(" (HW TFs = " << tessFactorCount << " dwords)\n"); + LLPC_OUTS("TF0/TF1 Messaging = " << (m_pipelineState->canOptimizeTessFactor() ? 
"true" : "false") << "\n"); + LLPC_OUTS("\n"); + LLPC_OUTS("Tessellator Patch:\n"); + LLPC_OUTS("InputVertices = " << inputVertexCount << ", VertexStride = " << hwConfig.onChip.inputVertexStride + << " dwords, Size = " << hwConfig.onChip.inputPatchSize << " dwords\n"); + LLPC_OUTS("OutputVertices = " << outputVertexCount << ", VertexStride = [" << hwConfig.onChip.outputVertexStride + << ", " << hwConfig.offChip.outputVertexStride << "] dwords, Size = [" + << hwConfig.onChip.outputPatchSize << ", " << hwConfig.offChip.outputPatchSize + << "] dwords\n"); + LLPC_OUTS("PatchConstants = " << patchConstCount << ", Size = [" << hwConfig.onChip.patchConstSize << ", " + << hwConfig.offChip.patchConstSize << "] dwords\n"); + + LLPC_OUTS("\n"); + LLPC_OUTS("Onchip LDS Layout (in dwords):\n"); + if (m_pipelineState->canOptimizeTessFactor()) { + printLdsLayout("HS Patch Count", hwConfig.onChip.hsPatchCountStart, 1); + printLdsLayout("Special TF Values", hwConfig.onChip.specialTfValueStart, hwConfig.onChip.specialTfValueSize); } + printLdsLayout("Output Patches", hwConfig.onChip.outputPatchStart, onChipOutputPatchTotalSize); + printLdsLayout("Patch Constants", hwConfig.onChip.patchConstStart, onChipPatchConstTotalSize); + printLdsLayout("TFs", hwConfig.onChip.tessFactorStart, tessFactorTotalSize); + printLdsLayout("Input Patches", hwConfig.onChip.inputPatchStart, inputPatchTotalSize); + if (hwConfig.rayQueryLdsStackSize > 0) + printLdsLayout("Ray Query Stack", hwConfig.tessOnChipLdsSize, hwConfig.rayQueryLdsStackSize); + LLPC_OUTS("Total Onchip LDS = " << hwConfig.tessOnChipLdsSize + hwConfig.rayQueryLdsStackSize << " dwords\n"); + LLPC_OUTS("\n"); + LLPC_OUTS("Offchip LDS Buffer Layout (in dwords):\n"); + printLdsLayout("Output Patches", hwConfig.offChip.outputPatchStart, offChipOutputPatchTotalSize); + printLdsLayout("Patch Constants", hwConfig.offChip.patchConstStart, offChipPatchConstTotalSize); + LLPC_OUTS("Total Offchip LDS Buffer = " << offChipOutputPatchTotalSize + 
offChipPatchConstTotalSize + << " dwords\n"); LLPC_OUTS("\n"); } } @@ -527,11 +576,10 @@ void LowerInOut::visitCallInst(CallInst &callInst) { auto exportGenericOutput = lgcName::OutputExportGeneric; auto exportBuiltInOutput = lgcName::OutputExportBuiltIn; - auto exportXfbOutput = lgcName::OutputExportXfb; const bool isGenericOutputExport = mangledName.starts_with(exportGenericOutput); const bool isBuiltInOutputExport = mangledName.starts_with(exportBuiltInOutput); - const bool isXfbOutputExport = mangledName.starts_with(exportXfbOutput); + const bool isXfbOutputExport = isa(callInst); const bool isExport = (isGenericOutputExport || isBuiltInOutputExport || isXfbOutputExport); @@ -952,7 +1000,7 @@ void LowerInOut::visitCallInst(CallInst &callInst) { exist = true; loc = value; } else { - // Generic output exports of FS should have been handled by the LowerFragColorExport pass + // Generic output exports of FS should have been handled by the LowerFragmentColorExport pass assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::Geometry || m_shaderStage == ShaderStage::TessEval); @@ -1039,69 +1087,93 @@ void LowerInOut::visitCallInst(CallInst &callInst) { } } else { // Other calls relevant to input/output import/export - if (callee->isIntrinsic() && callee->getIntrinsicID() == Intrinsic::amdgcn_s_sendmsg) { - unsigned emitStream = InvalidValue; - uint64_t message = cast(callInst.getArgOperand(0))->getZExtValue(); - if (message == GsEmitStream0 || message == GsEmitStream1 || message == GsEmitStream2 || - message == GsEmitStream3) { - // NOTE: MSG[9:8] = STREAM_ID - emitStream = (message & GsEmitCutStreamIdMask) >> GsEmitCutStreamIdShift; - } + if (isa(callInst)) { + assert(m_shaderStage == ShaderStage::Geometry); // Must be geometry shader - if (emitStream != InvalidValue) { - assert(m_shaderStage == ShaderStage::Geometry); // Must be geometry shader + const unsigned streamId = cast(callInst).getStreamId(); + assert(streamId < MaxGsStreams); - // 
NOTE: Implicitly store the value of view index to GS-VS ring buffer for raster stream if multi-view is - // enabled. Copy shader will read the value from GS-VS ring and export it to vertex position data. - if (m_pipelineState->getInputAssemblyState().multiView != MultiViewMode::Disable) { - auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry); - auto rasterStream = m_pipelineState->getRasterizerState().rasterStream; + // NOTE: Implicitly store the value of view index to GS-VS ring buffer for raster stream if multi-view is + // enabled. Copy shader will read the value from GS-VS ring and export it to vertex position data. + if (m_pipelineState->getInputAssemblyState().multiView != MultiViewMode::Disable) { + auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry); + auto rasterStream = m_pipelineState->getRasterizerState().rasterStream; - if (emitStream == rasterStream) { - // When multiview and viewIndexFromDeviceIndex enable, it can't use the device id - // as viewId to storeValueToGsVsRing when multiview in the same device - auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Geometry)->entryArgIdxs.gs; - auto viewIndex = getFunctionArgument(m_entryPoint, entryArgIdxs.viewId); + if (streamId == rasterStream) { + // When multiview and viewIndexFromDeviceIndex enable, it can't use the device ID + // as viewId to storeValueToGsVsRing when multiview in the same device + auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Geometry)->entryArgIdxs.gs; + auto viewIndex = getFunctionArgument(m_entryPoint, entryArgIdxs.viewId); - const auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap; - assert(builtInOutLocMap.find(BuiltInViewIndex) != builtInOutLocMap.end()); - unsigned loc = builtInOutLocMap.find(BuiltInViewIndex)->second; + const auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap; + assert(builtInOutLocMap.find(BuiltInViewIndex) != 
builtInOutLocMap.end()); + unsigned loc = builtInOutLocMap.find(BuiltInViewIndex)->second; - storeValueToGsVsRing(viewIndex, loc, 0, rasterStream, builder); - } + storeValueToGsVsRing(viewIndex, loc, 0, rasterStream, builder); } + } - // Increment emit vertex counter - auto emitCounterPair = m_pipelineSysValues.get(m_entryPoint)->getEmitCounterPtr(); - auto emitCounterTy = emitCounterPair.first; - auto emitCounterPtr = emitCounterPair.second[emitStream]; - Value *emitCounter = builder.CreateLoad(emitCounterTy, emitCounterPtr); - emitCounter = builder.CreateAdd(emitCounter, builder.getInt32(1)); - builder.CreateStore(emitCounter, emitCounterPtr); - - // Increment total emit vertex counter - if (m_pipelineState->getShaderModes()->getGeometryShaderMode().robustGsEmits) { - auto totalEmitCounterPtr = m_pipelineSysValues.get(m_entryPoint)->getTotalEmitCounterPtr(); - Value *totalEmitCounter = builder.CreateLoad(builder.getInt32Ty(), totalEmitCounterPtr); - - // totalEmitCounter++ - totalEmitCounter = builder.CreateAdd(totalEmitCounter, builder.getInt32(1)); - builder.CreateStore(totalEmitCounter, totalEmitCounterPtr); - - if (m_gfxIp.major < 11) { - // NOTE: For pre-GFX11, the counters of primitives written are driven by the message GS_EMIT/GS_CUT. - // Therefore, we must send such message conditionally by checking if the emit is within expected range. 
- - // validEmit = totalEmitCounter <= outputVertices - const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); - auto validEmit = builder.CreateICmpULE(totalEmitCounter, builder.getInt32(geometryMode.outputVertices)); - - // Send the GS_EMIT message conditionally - builder.CreateIf(validEmit, false); - callInst.moveBefore(&*builder.GetInsertPoint()); - } + // Increment emit counter + auto emitCounterPair = m_pipelineSysValues.get(m_entryPoint)->getEmitCounterPtr(); + auto emitCounterTy = emitCounterPair.first; + auto emitCounterPtr = emitCounterPair.second[streamId]; + Value *emitCounter = builder.CreateLoad(emitCounterTy, emitCounterPtr); + emitCounter = builder.CreateAdd(emitCounter, builder.getInt32(1)); + builder.CreateStore(emitCounter, emitCounterPtr); + + // Increment total emit counter + if (m_pipelineState->getShaderModes()->getGeometryShaderMode().robustGsEmits) { + auto totalEmitCounterPtr = m_pipelineSysValues.get(m_entryPoint)->getTotalEmitCounterPtr(); + Value *totalEmitCounter = builder.CreateLoad(builder.getInt32Ty(), totalEmitCounterPtr); + + // totalEmitCounter++ + totalEmitCounter = builder.CreateAdd(totalEmitCounter, builder.getInt32(1)); + builder.CreateStore(totalEmitCounter, totalEmitCounterPtr); + + if (!m_pipelineState->getNggControl()->enableNgg) { + // NOTE: For legacy GS, the counters of primitives written are driven by the message GS_EMIT/GS_CUT. + // Therefore, we must send such message conditionally by checking if the emit is within expected range. 
+ assert(m_gfxIp.major < 11); + + // validEmit = totalEmitCounter <= outputVertices + const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); + auto validEmit = builder.CreateICmpULE(totalEmitCounter, builder.getInt32(geometryMode.outputVertices)); + + // Send the GS_EMIT message conditionally + builder.CreateIf(validEmit, false); + callInst.moveBefore(&*builder.GetInsertPoint()); + builder.SetInsertPoint(&callInst); // Restore insert point modified by CreateIf } } + + // For legacy GS, lower the dialect op GsEmitStreamOp to sendmsg intrinsic + if (!m_pipelineState->getNggControl()->enableNgg) { + m_gsMsgCalls.push_back(&callInst); + + auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Geometry)->entryArgIdxs.gs; + auto gsWaveId = getFunctionArgument(m_entryPoint, entryArgIdxs.gsWaveId); + + // [9:8] = stream, [5:4] = 2 (emit), [3:0] = 2 (GS) + unsigned msg = (streamId << 8) | GsEmit; + builder.CreateIntrinsic(Intrinsic::amdgcn_s_sendmsg, {}, {builder.getInt32(msg), gsWaveId}, nullptr); + } + } else if (isa(callInst)) { + assert(m_shaderStage == ShaderStage::Geometry); // Must be geometry shader + + const unsigned streamId = cast(callInst).getStreamId(); + assert(streamId < MaxGsStreams); + + // For legacy GS, lower the dialect op GsCutStreamOp to sendmsg intrinsic + if (!m_pipelineState->getNggControl()->enableNgg) { + m_gsMsgCalls.push_back(&callInst); + + auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Geometry)->entryArgIdxs.gs; + auto gsWaveId = getFunctionArgument(m_entryPoint, entryArgIdxs.gsWaveId); + + // [9:8] = stream, [5:4] = 1 (cut), [3:0] = 2 (GS) + unsigned msg = (streamId << 8) | GsCut; + builder.CreateIntrinsic(Intrinsic::amdgcn_s_sendmsg, {}, {builder.getInt32(msg), gsWaveId}, nullptr); + } } } } @@ -1267,19 +1339,8 @@ void LowerInOut::visitReturnInst(ReturnInst &retInst) { const auto &nextBuiltInUsage = 
m_pipelineState->getShaderResourceUsage(ShaderStage::Fragment)->builtInUsage.fs; // NOTE: If gl_Position is not present in this shader stage, we have to export a dummy one. - if (!usePosition) { - Value *args[] = { - builder.getInt32(EXP_TARGET_POS_0), // tgt - builder.getInt32(0xF), // en - zero, // src0 - zero, // src1 - zero, // src2 - one, // src3 - builder.getInt1(false), // done - builder.getInt1(false) // vm - }; - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_exp, args); - } + if (!usePosition) + exportPosition(0, {zero, zero, zero, one}, builder); // NOTE: In such case, last shader in the pre-rasterization doesn't export layer while fragment shader expects to // read it. Should export 0 to fragment shader, which is required by the spec. @@ -1306,7 +1367,7 @@ void LowerInOut::visitReturnInst(ReturnInst &retInst) { cullDistance.push_back(builder.CreateExtractValue(m_cullDistance, i)); // Merge gl_ClipDistance[] and gl_CullDistance[] - std::vector clipCullDistance; + SmallVector clipCullDistance; clipCullDistance.reserve(clipDistance.size() + cullDistance.size()); for (auto clipDistanceElement : clipDistance) clipCullDistance.push_back(clipDistanceElement); @@ -1326,41 +1387,22 @@ void LowerInOut::visitReturnInst(ReturnInst &retInst) { bool miscExport = usePointSize || useLayer || useViewportIndex || useShadingRate || enableMultiView || useEdgeFlag; // NOTE: When misc. export is present, gl_ClipDistance[] or gl_CullDistance[] should start from pos2. - unsigned pos = miscExport ? EXP_TARGET_POS_2 : EXP_TARGET_POS_1; + unsigned exportSlot = miscExport ? 
2 : 1; unsigned clipPlaneMask = m_pipelineState->getOptions().clipPlaneMask; bool needMapClipDistMask = ((clipPlaneMask != 0) && m_pipelineState->getOptions().enableMapClipDistMask); assert(!m_pipelineState->getOptions().enableMapClipDistMask || ((clipPlaneMask & 0xF) == 0)); - Value *args[] = { - builder.getInt32(pos), // tgt - builder.getInt32(0xF), // en - clipCullDistance[0], // src0 - clipCullDistance[1], // src1 - clipCullDistance[2], // src2 - clipCullDistance[3], // src3 - builder.getInt1(false), // done - builder.getInt1(false) // vm - }; - if (!needMapClipDistMask) { - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_exp, args); - pos++; + exportPosition(exportSlot, {clipCullDistance[0], clipCullDistance[1], clipCullDistance[2], clipCullDistance[3]}, + builder); + exportSlot++; } if (clipCullDistance.size() > 4) { // Do the second exporting - Value *args[] = { - builder.getInt32(pos), // tgt - builder.getInt32(0xF), // en - clipCullDistance[4], // src0 - clipCullDistance[5], // src1 - clipCullDistance[6], // src2 - clipCullDistance[7], // src3 - builder.getInt1(false), // done - builder.getInt1(false) // vm - }; - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_exp, args); + exportPosition(exportSlot, {clipCullDistance[4], clipCullDistance[5], clipCullDistance[6], clipCullDistance[7]}, + builder); } // NOTE: We have to export gl_ClipDistance[] or gl_CullDistancep[] via generic outputs as well. 
@@ -1404,13 +1446,13 @@ void LowerInOut::visitReturnInst(ReturnInst &retInst) { assert(it != builtInOutLocs.end()); const unsigned loc = it->second; - recordVertexAttribExport(loc, - {clipCullDistance[0], clipCullDistance[1], clipCullDistance[2], clipCullDistance[3]}); + recordVertexAttribute(loc, + {clipCullDistance[0], clipCullDistance[1], clipCullDistance[2], clipCullDistance[3]}); if (clipCullDistance.size() > 4) { // Do the second exporting - recordVertexAttribExport( - loc + 1, {clipCullDistance[4], clipCullDistance[5], clipCullDistance[6], clipCullDistance[7]}); + recordVertexAttribute(loc + 1, + {clipCullDistance[4], clipCullDistance[5], clipCullDistance[6], clipCullDistance[7]}); } } } @@ -1434,14 +1476,13 @@ void LowerInOut::visitReturnInst(ReturnInst &retInst) { assert(m_primitiveId); Value *primitiveId = builder.CreateBitCast(m_primitiveId, builder.getFloatTy()); - recordVertexAttribExport(loc, {primitiveId, poison, poison, poison}); + recordVertexAttribute(loc, {primitiveId, poison, poison, poison}); } } // Export EdgeFlag - if (useEdgeFlag) { + if (useEdgeFlag) addExportInstForBuiltInOutput(m_edgeFlag, BuiltInEdgeFlag, builder); - } // Export gl_Layer and gl_ViewportIndex before entry-point returns if (useLayer || useViewportIndex || enableMultiView) { @@ -1482,19 +1523,7 @@ void LowerInOut::visitReturnInst(ReturnInst &retInst) { } viewportIndexAndLayer = builder.CreateBitCast(viewportIndexAndLayer, builder.getFloatTy()); - - Value *args[] = { - builder.getInt32(EXP_TARGET_POS_1), // tgt - builder.getInt32(0x4), // en - poison, // src0 - poison, // src1 - viewportIndexAndLayer, // src2 - poison, // src3 - builder.getInt1(false), // done - builder.getInt1(false) // vm - }; - - builder.CreateIntrinsic(Intrinsic::amdgcn_exp, builder.getFloatTy(), args, {}); + exportPosition(1, {poison, poison, viewportIndexAndLayer, poison}, builder); // NOTE: We have to export gl_ViewportIndex via generic outputs as well. 
if (useViewportIndex) { @@ -1511,7 +1540,7 @@ void LowerInOut::visitReturnInst(ReturnInst &retInst) { Value *viewportIndex = builder.CreateBitCast(m_viewportIndex, builder.getFloatTy()); - recordVertexAttribExport(loc, {viewportIndex, poison, poison, poison}); + recordVertexAttribute(loc, {viewportIndex, poison, poison, poison}); } } @@ -1530,21 +1559,13 @@ void LowerInOut::visitReturnInst(ReturnInst &retInst) { Value *layer = builder.CreateBitCast(m_layer, builder.getFloatTy()); - recordVertexAttribExport(loc, {layer, poison, poison, poison}); + recordVertexAttribute(loc, {layer, poison, poison, poison}); } } } - // NOTE: For GFX10+, dummy generic output is no longer needed. Field NO_PC_EXPORT of SPI_VS_OUT_CONFIG - // will control the behavior. - if (m_gfxIp.major <= 9) { - // NOTE: If no generic outputs is present in this shader, we have to export a dummy one - if (inOutUsage.expCount == 0) - recordVertexAttribExport(0, {poison, poison, poison, poison}); - } - // Export vertex attributes that were recorded previously - exportVertexAttribs(builder); + exportAttributes(builder); if (m_pipelineState->isUnlinked()) { // If we are building unlinked relocatable shaders, it is possible there are @@ -1579,7 +1600,7 @@ void LowerInOut::visitReturnInst(ReturnInst &retInst) { builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_s_sendmsg, {builder.getInt32(GsDone), gsWaveId}); } } else if (m_shaderStage == ShaderStage::Fragment) { - // Fragment shader export are handled in LowerFragColorExport. + // Fragment shader exports are handled in LowerFragmentColorExport. 
return; } } @@ -1971,7 +1992,7 @@ Value *LowerInOut::patchTcsGenericOutputImport(Type *outputTy, unsigned location Value *vertexIdx, BuilderBase &builder) { assert(compIdx); auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, location, locOffset, compIdx, vertexIdx, builder); - return readValueFromLds(true, outputTy, ldsOffset, builder); + return readValueFromLds(false, outputTy, ldsOffset, builder); } // ===================================================================================================================== @@ -2021,9 +2042,8 @@ void LowerInOut::patchVsGenericOutputExport(Value *output, unsigned location, un void LowerInOut::patchTcsGenericOutputExport(Value *output, unsigned location, Value *locOffset, Value *compIdx, Value *vertexIdx, BuilderBase &builder) { assert(compIdx); - Type *outputTy = output->getType(); - auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, location, locOffset, compIdx, vertexIdx, builder); - writeValueToLds(true, output, ldsOffset, builder); + auto ldsOffset = calcLdsOffsetForTcsOutput(output->getType(), location, locOffset, compIdx, vertexIdx, builder); + writeValueToLds(false, output, ldsOffset, builder); } // ===================================================================================================================== @@ -2161,7 +2181,7 @@ Value *LowerInOut::patchTcsBuiltInInputImport(Type *inputTy, unsigned builtInId, auto elemIdx = builder.getInt32(i); auto ldsOffset = calcLdsOffsetForTcsInput(elemTy, loc, nullptr, elemIdx, vertexIdx, builder); auto elem = readValueFromLds(false, elemTy, ldsOffset, builder); - builder.CreateInsertValue(input, elem, i); + input = builder.CreateInsertValue(input, elem, i); } } else { auto ldsOffset = calcLdsOffsetForTcsInput(inputTy, loc, nullptr, elemIdx, vertexIdx, builder); @@ -2380,11 +2400,6 @@ Value *LowerInOut::patchGsBuiltInInputImport(Type *inputTy, unsigned builtInId, input = builder.getInt32(0); break; } - // Handle internal-use built-ins - case BuiltInGsWaveId: { - 
input = getFunctionArgument(m_entryPoint, entryArgIdxs.gsWaveId); - break; - } default: { llvm_unreachable("Should never be called!"); break; @@ -2722,6 +2737,7 @@ Value *LowerInOut::patchFsBuiltInInputImport(Type *inputTy, unsigned builtInId, input = builder.CreateIntrinsic(Intrinsic::amdgcn_ubfe, builder.getInt32Ty(), {sampleInfo, builder.getInt32(2), builder.getInt32(5)}); } else { + assert(m_pipelineState->getRasterizerState().numSamples != 0); input = builder.getInt32(m_pipelineState->getRasterizerState().numSamples); } break; @@ -2851,6 +2867,7 @@ Value *LowerInOut::patchTcsBuiltInOutputImport(Type *outputTy, unsigned builtInI const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl); const auto &builtInUsage = resUsage->builtInUsage.tcs; const auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap; + const auto &perPatchBuiltInOutLocMap = resUsage->inOutUsage.perPatchBuiltInOutputLocMap; switch (builtInId) { case BuiltInPosition: @@ -2863,7 +2880,7 @@ Value *LowerInOut::patchTcsBuiltInOutputImport(Type *outputTy, unsigned builtInI unsigned loc = builtInOutLocMap.find(builtInId)->second; auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, loc, nullptr, elemIdx, vertexIdx, builder); - output = readValueFromLds(true, outputTy, ldsOffset, builder); + output = readValueFromLds(false, outputTy, ldsOffset, builder); break; } @@ -2889,12 +2906,12 @@ Value *LowerInOut::patchTcsBuiltInOutputImport(Type *outputTy, unsigned builtInI for (unsigned i = 0; i < outputTy->getArrayNumElements(); ++i) { auto elemIdx = builder.getInt32(i); auto ldsOffset = calcLdsOffsetForTcsOutput(elemTy, loc, nullptr, elemIdx, vertexIdx, builder); - auto elem = readValueFromLds(true, elemTy, ldsOffset, builder); + auto elem = readValueFromLds(false, elemTy, ldsOffset, builder); output = builder.CreateInsertValue(output, elem, {i}); } } else { auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, loc, nullptr, elemIdx, vertexIdx, builder); - output = 
readValueFromLds(true, outputTy, ldsOffset, builder); + output = readValueFromLds(false, outputTy, ldsOffset, builder); } break; @@ -2905,26 +2922,19 @@ Value *LowerInOut::patchTcsBuiltInOutputImport(Type *outputTy, unsigned builtInI assert(builtInId != BuiltInTessLevelInner || builtInUsage.tessLevelInner); (void(builtInUsage)); // Unused - // tessLevelOuter (float[4]) + tessLevelInner (float[2]) - // ldsOffset = relativeId * MaxTessFactorsPerPatch + elemIdx - uint32_t tessOffset = 0; - if (builtInId == BuiltInTessLevelInner) - tessOffset += 4; - - auto relativeId = m_pipelineSysValues.get(m_entryPoint)->getRelativeId(); - Value *baseOffset = builder.CreateMul(relativeId, builder.getInt32(MaxTessFactorsPerPatch)); + assert(perPatchBuiltInOutLocMap.find(builtInId) != perPatchBuiltInOutLocMap.end()); + unsigned loc = perPatchBuiltInOutLocMap.find(builtInId)->second; if (outputTy->isArrayTy()) { - // Import the whole tessLevel array + // Handle the whole array for (unsigned i = 0; i < outputTy->getArrayNumElements(); ++i) { - Value *ldsOffset = builder.CreateAdd(baseOffset, builder.getInt32(tessOffset + i)); + auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, loc, nullptr, builder.getInt32(i), nullptr, builder); auto elem = readValueFromLds(false, builder.getFloatTy(), ldsOffset, builder); output = builder.CreateInsertValue(output, elem, {i}); } } else { - // Import a single element of tessLevel array - Value *ldsOffset = builder.CreateAdd(baseOffset, builder.getInt32(tessOffset)); - ldsOffset = builder.CreateAdd(ldsOffset, elemIdx); + // Handle a single element of the array + auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, loc, nullptr, elemIdx, nullptr, builder); output = readValueFromLds(false, outputTy, ldsOffset, builder); } @@ -3119,6 +3129,7 @@ void LowerInOut::patchTcsBuiltInOutputExport(Value *output, unsigned builtInId, const auto &builtInUsage = resUsage->builtInUsage.tcs; const auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap; 
const auto &perPatchBuiltInOutLocMap = resUsage->inOutUsage.perPatchBuiltInOutputLocMap; + const auto &hwConfig = resUsage->inOutUsage.tcs.hwConfig; switch (builtInId) { case BuiltInPosition: @@ -3137,7 +3148,7 @@ void LowerInOut::patchTcsBuiltInOutputExport(Value *output, unsigned builtInId, unsigned loc = builtInOutLocMap.find(builtInId)->second; auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, loc, nullptr, elemIdx, vertexIdx, builder); - writeValueToLds(true, output, ldsOffset, builder); + writeValueToLds(false, output, ldsOffset, builder); break; } @@ -3158,56 +3169,98 @@ void LowerInOut::patchTcsBuiltInOutputExport(Value *output, unsigned builtInId, auto elem = builder.CreateExtractValue(output, i); auto elemIdx = builder.getInt32(i); auto ldsOffset = calcLdsOffsetForTcsOutput(elem->getType(), loc, nullptr, elemIdx, vertexIdx, builder); - writeValueToLds(true, elem, ldsOffset, builder); + writeValueToLds(false, elem, ldsOffset, builder); } } else { auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, loc, nullptr, elemIdx, vertexIdx, builder); - writeValueToLds(true, output, ldsOffset, builder); + writeValueToLds(false, output, ldsOffset, builder); } break; } case BuiltInTessLevelOuter: case BuiltInTessLevelInner: { - auto relativeId = m_pipelineSysValues.get(m_entryPoint)->getRelativeId(); + if ((builtInId == BuiltInTessLevelOuter && builtInUsage.tessLevelOuter) || + (builtInId == BuiltInTessLevelInner && builtInUsage.tessLevelInner)) { + unsigned loc = perPatchBuiltInOutLocMap.find(builtInId)->second; + + if (outputTy->isArrayTy()) { + // Handle the whole array + for (unsigned i = 0; i < outputTy->getArrayNumElements(); ++i) { + auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, loc, nullptr, builder.getInt32(i), nullptr, builder); + auto elem = builder.CreateExtractValue(output, {i}); + writeValueToLds(false, elem, ldsOffset, builder); + } + } else { + // Handle a single element of the array + auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, loc, 
nullptr, elemIdx, nullptr, builder); + writeValueToLds(false, output, ldsOffset, builder); + } + } - // tessLevelOuter (float[4]) + tessLevelInner (float[2]) - // ldsOffset = relativeId * MaxTessFactorsPerPatch + elemIdx + // Write TFs to the dedicated region of on-chip LDS for later HW TF buffer store (read by HW tessellator) + unsigned numOuterTfs = 0; + unsigned numInnerTfs = 0; + unsigned numTfs = 0; + + const auto primitiveMode = m_pipelineState->getShaderModes()->getTessellationMode().primitiveMode; + switch (primitiveMode) { + case PrimitiveMode::Triangles: + numOuterTfs = 3; + numInnerTfs = 1; + break; + case PrimitiveMode::Quads: + numOuterTfs = 4; + numInnerTfs = 2; + break; + case PrimitiveMode::Isolines: + numOuterTfs = 2; + numInnerTfs = 0; + break; + default: + llvm_unreachable("Unknown primitive mode!"); + break; + } + numTfs = (builtInId == BuiltInTessLevelOuter) ? numOuterTfs : numInnerTfs; + + auto relPatchId = m_pipelineSysValues.get(m_entryPoint)->getRelativeId(); + + // tessLevelOuter (numOuterTfs) + tessLevelInner (numInnerTfs) + // ldsOffset = tessFactorStart + relPatchId * tessFactorStride + elemIdx uint32_t tessOffset = 0; if (builtInId == BuiltInTessLevelInner) - tessOffset += 4; + tessOffset += numOuterTfs; + + Value *baseOffset = builder.CreateMul(relPatchId, builder.getInt32(hwConfig.onChip.tessFactorStride)); + baseOffset = builder.CreateAdd(baseOffset, builder.getInt32(hwConfig.onChip.tessFactorStart)); - // Write tessellation factors to on-chip LDS for later TF buffer store - Value *baseOffset = builder.CreateMul(relativeId, builder.getInt32(MaxTessFactorsPerPatch)); if (outputTy->isArrayTy()) { - // Handle the whole tessLevelOuter array - for (unsigned i = 0; i < outputTy->getArrayNumElements(); ++i) { + // Handle the whole array, skip irrelevant TFs + for (unsigned i = 0; i < numTfs; ++i) { Value *ldsOffset = builder.CreateAdd(baseOffset, builder.getInt32(tessOffset + i)); auto elem = builder.CreateExtractValue(output, {i}); 
writeValueToLds(false, elem, ldsOffset, builder); } } else { - // Handle a single element of tessLevelOuter array + // Handle a single element of the array Value *ldsOffset = builder.CreateAdd(baseOffset, builder.getInt32(tessOffset)); - ldsOffset = builder.CreateAdd(ldsOffset, elemIdx); - writeValueToLds(false, output, ldsOffset, builder); - } - - // Write tessellation factors for TES to read if needed - if (perPatchBuiltInOutLocMap.find(builtInId) != perPatchBuiltInOutLocMap.end()) { - unsigned loc = perPatchBuiltInOutLocMap.find(builtInId)->second; - - if (outputTy->isArrayTy()) { - // Handle the whole tessLevelOuter array - for (unsigned i = 0; i < outputTy->getArrayNumElements(); ++i) { - auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, loc, nullptr, builder.getInt32(i), nullptr, builder); - auto elem = builder.CreateExtractValue(output, {i}); - writeValueToLds(true, elem, ldsOffset, builder); + if (isa<ConstantInt>(elemIdx)) { + // Skip irrelevant TFs + if (cast<ConstantInt>(elemIdx)->getZExtValue() < numTfs) { + ldsOffset = builder.CreateAdd(ldsOffset, elemIdx); + writeValueToLds(false, output, ldsOffset, builder); } } else { - // Handle a single element of tessLevelOuter array - auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, loc, nullptr, elemIdx, nullptr, builder); - writeValueToLds(true, output, ldsOffset, builder); + // NOTE: We use odd-dword stride to avoid LDS bank conflict. Since the number of TFs is always even, the last + // TF slot is unused. We can reuse it to store irrelevant TFs. + assert(numOuterTfs + numInnerTfs + 1 == hwConfig.onChip.tessFactorStride); + unsigned invalidElemIdx = hwConfig.onChip.tessFactorStride - 1; + + // elemIdx = elemIdx < numTfs ? 
elemIdx : invalidElemIdx + auto relevantTf = builder.CreateICmpULT(elemIdx, builder.getInt32(numTfs)); + elemIdx = builder.CreateSelect(relevantTf, elemIdx, builder.getInt32(invalidElemIdx)); + ldsOffset = builder.CreateAdd(ldsOffset, elemIdx); + writeValueToLds(false, output, ldsOffset, builder); } } @@ -3757,47 +3810,21 @@ void LowerInOut::storeValueToStreamOutBuffer(Value *storeValue, unsigned xfbBuff m_shaderStage == ShaderStage::CopyShader); assert(xfbBuffer < MaxTransformFeedbackBuffers); - if (m_pipelineState->enableSwXfb()) { - // NOTE: For GFX11+, exporting transform feedback outputs is represented by a call and the call is - // replaced with real instructions when when NGG primitive shader is generated. - std::string callName = lgcName::NggXfbExport + getTypeName(storeValue->getType()); - builder.CreateNamedCall( - callName, builder.getVoidTy(), - {builder.getInt32(xfbBuffer), builder.getInt32(xfbOffset), builder.getInt32(streamId), storeValue}, {}); - return; - } - - // NOTE: SW XFB must have been handled. Here we only handle HW XFB on pre-GFX11 generations. - assert(m_gfxIp.major == 10); - auto storeTy = storeValue->getType(); + assert(storeTy->getScalarSizeInBits() == 32); // Must be 32-bit type unsigned compCount = storeTy->isVectorTy() ? cast(storeTy)->getNumElements() : 1; assert(compCount <= 4); - const uint64_t bitWidth = storeTy->getScalarSizeInBits(); - assert(bitWidth == 16 || bitWidth == 32); - - if (storeTy->isIntOrIntVectorTy(16)) { - Type *newStoreTy = compCount > 1 ? FixedVectorType::get(builder.getHalfTy(), compCount) : builder.getHalfTy(); - storeValue = builder.CreateBitCast(storeValue, newStoreTy); - storeTy = newStoreTy; - } - - // NOTE: For 16vec3, HW doesn't have a corresponding buffer store instruction. We have to split it to 16vec2 and - // 16scalar. 
- if (bitWidth == 16 && compCount == 3) { - // 16vec3 -> 16vec2 + 16scalar - Value *compX2 = builder.CreateShuffleVector(storeValue, {0, 1}); - storeValueToStreamOutBuffer(compX2, xfbBuffer, xfbOffset, xfbStride, streamId, builder); - - Value *comp = builder.CreateExtractElement(storeValue, 2); - xfbOffset += 2 * (bitWidth / 8); - storeValueToStreamOutBuffer(comp, xfbBuffer, xfbOffset, xfbStride, streamId, builder); - + if (m_pipelineState->getNggControl()->enableNgg) { + assert(m_pipelineState->enableSwXfb()); + builder.create<NggXfbExportOp>(xfbBuffer, xfbOffset, streamId, storeValue); + return; } + // NOTE: SW XFB must have been handled. Here we only handle HW XFB on pre-GFX11 generations. + assert(m_gfxIp.major == 10); + Value *streamInfo = nullptr; Value *writeIndex = nullptr; Value *streamOffset = nullptr; @@ -3847,13 +3874,13 @@ void LowerInOut::storeValueToStreamOutBuffer(Value *storeValue, unsigned xfbBuff // writeIndex += threadId writeIndex = builder.CreateAdd(writeIndex, m_threadId); - static unsigned char formatTable[4][2] = { - {BUF_FORMAT_16_FLOAT, BUF_FORMAT_32_FLOAT}, - {BUF_FORMAT_16_16_FLOAT, BUF_FORMAT_32_32_FLOAT_GFX10}, - {BUF_FORMAT_INVALID, BUF_FORMAT_32_32_32_FLOAT_GFX10}, - {BUF_FORMAT_16_16_16_16_FLOAT_GFX10, BUF_FORMAT_32_32_32_32_FLOAT_GFX10}, + static unsigned char formatTable[] = { + BUF_FORMAT_32_FLOAT, + BUF_FORMAT_32_32_FLOAT_GFX10, + BUF_FORMAT_32_32_32_FLOAT_GFX10, + BUF_FORMAT_32_32_32_32_FLOAT_GFX10, }; - unsigned format = formatTable[compCount - 1][bitWidth == 32]; + unsigned format = formatTable[compCount - 1]; CoherentFlag coherent = {}; coherent.bits.glc = true; @@ -4000,16 +4027,12 @@ void LowerInOut::storeValueToGsVsRing(Value *storeValue, unsigned location, unsi assert((elemTy->isFloatingPointTy() || elemTy->isIntegerTy()) && (bitWidth == 8 || bitWidth == 16 || bitWidth == 32)); if (m_pipelineState->getNggControl()->enableNgg) { - // NOTE: For NGG, writing GS output to GS-VS ring is represented by a call and the call is replaced with - 
real instructions when when NGG primitive shader is generated. - Value *args[] = {builder.getInt32(location), builder.getInt32(compIdx), builder.getInt32(streamId), storeValue}; - std::string callName = lgcName::NggWriteGsOutput + getTypeName(storeTy); - builder.CreateNamedCall(callName, Type::getVoidTy(*m_context), args, {}); + builder.create<NggWriteGsOutputOp>(location, compIdx, streamId, storeValue); return; } // NOTE: NGG with GS must have been handled. Here we only handle pre-GFX11 generations. - assert(m_pipelineState->getTargetInfo().getGfxIpVersion().major < 11); + assert(m_gfxIp.major < 11); if (storeTy->isArrayTy() || storeTy->isVectorTy()) { const unsigned elemCount = storeTy->isArrayTy() ? cast<ArrayType>(storeTy)->getNumElements() @@ -4072,16 +4095,6 @@ void LowerInOut::storeValueToGsVsRing(Value *storeValue, unsigned location, unsi // NOTE: Here we use tbuffer_store instruction instead of buffer_store because we have to do explicit // control of soffset. This is required by swizzle enabled mode when address range checking should be // complied with. 
- unsigned format; - if (m_gfxIp.major <= 9) { - CombineFormat combineFormat = {}; - combineFormat.bits.dfmt = BUF_DATA_FORMAT_32; - combineFormat.bits.nfmt = BUF_NUM_FORMAT_UINT; - format = combineFormat.u32All; - } else { - format = BUF_FORMAT_32_UINT; - } - CoherentFlag coherent = {}; coherent.bits.glc = true; coherent.bits.slc = true; @@ -4092,7 +4105,7 @@ void LowerInOut::storeValueToGsVsRing(Value *storeValue, unsigned location, unsi m_pipelineSysValues.get(m_entryPoint)->getGsVsRingBufDesc(streamId), // rsrc ringOffset, // voffset gsVsOffset, // soffset - builder.getInt32(format), + builder.getInt32(BUF_FORMAT_32_UINT), builder.getInt32(coherent.u32All) // glc, slc, swz }; builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, args); @@ -4112,9 +4125,9 @@ Value *LowerInOut::calcEsGsRingOffsetForOutput(unsigned location, unsigned compI // ES -> GS ring is always on-chip on GFX10+ // ringOffset = esGsOffset + threadId * esGsRingItemSize + location * 4 + compIdx assert(m_pipelineState->hasShaderStage(ShaderStage::Geometry)); - const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor; + const auto &hwConfig = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig; - Value *ringOffset = builder.CreateMul(m_threadId, builder.getInt32(calcFactor.esGsRingItemSize)); + Value *ringOffset = builder.CreateMul(m_threadId, builder.getInt32(hwConfig.esGsRingItemSize)); ringOffset = builder.CreateAdd(ringOffset, esGsOffset); ringOffset = builder.CreateAdd(ringOffset, builder.getInt32(location * 4 + compIdx)); @@ -4132,7 +4145,7 @@ Value *LowerInOut::calcEsGsRingOffsetForInput(unsigned location, unsigned compId BuilderBase &builder) { // ES -> GS ring is always on-chip on GFX10+ assert(m_pipelineState->hasShaderStage(ShaderStage::Geometry)); - const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor; + 
const auto &hwConfig = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig; auto esGsOffsets = m_pipelineSysValues.get(m_entryPoint)->getEsGsOffsets(); const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); @@ -4153,7 +4166,7 @@ Value *LowerInOut::calcEsGsRingOffsetForInput(unsigned location, unsigned compId // +-----------------+-----------------+-----+-------------------+ // |<-------------------------- Patch -------------------------->| // - vertexOffset = builder.CreateMul(vertexIdx, builder.getInt32(calcFactor.esGsRingItemSize)); + vertexOffset = builder.CreateMul(vertexIdx, builder.getInt32(hwConfig.esGsRingItemSize)); vertexOffset = builder.CreateAdd(builder.CreateExtractElement(esGsOffsets, static_cast(0)), vertexOffset); } else { // vertexOffset = esGsOffsets[vertexIdx] (vertexIdx < 6) @@ -4184,7 +4197,7 @@ Value *LowerInOut::calcGsVsRingOffsetForOutput(unsigned location, unsigned compI unsigned streamBase = 0; for (int i = 0; i < MaxGsStreams; ++i) { streamBases[i] = streamBase; - streamBase += (resUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] * + streamBase += (resUsage->inOutUsage.gs.hwConfig.gsVsVertexItemSize[i] * m_pipelineState->getShaderModes()->getGeometryShaderMode().outputVertices); } @@ -4194,15 +4207,15 @@ Value *LowerInOut::calcGsVsRingOffsetForOutput(unsigned location, unsigned compI // threadId * gsVsRingItemSize + // (vertexIdx * vertexSizePerStream) + location * 4 + compIdx + streamBase (in dwords) - auto esGsLdsSize = builder.getInt32(resUsage->inOutUsage.gs.calcFactor.esGsLdsSize); + auto esGsLdsSize = builder.getInt32(resUsage->inOutUsage.gs.hwConfig.esGsLdsSize); gsVsOffset = builder.CreateLShr(gsVsOffset, 2, "", /*isExact=*/true); auto ringItemOffset = - builder.CreateMul(m_threadId, builder.getInt32(resUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize)); + builder.CreateMul(m_threadId, builder.getInt32(resUsage->inOutUsage.gs.hwConfig.gsVsRingItemSize)); // 
VertexSize is stream output vertexSize x 4 (in dwords) - unsigned vertexItemSize = resUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[streamId]; + unsigned vertexItemSize = resUsage->inOutUsage.gs.hwConfig.gsVsVertexItemSize[streamId]; auto vertexItemOffset = builder.CreateMul(vertexIdx, builder.getInt32(vertexItemSize)); ringOffset = builder.CreateAdd(esGsLdsSize, gsVsOffset); ringOffset = builder.CreateAdd(ringOffset, ringItemOffset); @@ -4255,9 +4268,7 @@ Value *LowerInOut::readValueFromLds(bool offChip, Type *readTy, Value *ldsOffset ldsOffset = builder.CreateMul(ldsOffset, builder.getInt32(4)); CoherentFlag coherent = {}; - if (m_gfxIp.major <= 9) - coherent.bits.glc = true; - else if (m_gfxIp.major == 10) { + if (m_gfxIp.major == 10) { coherent.bits.glc = true; coherent.bits.dlc = true; } else if (m_gfxIp.major == 11) { @@ -4396,11 +4407,11 @@ Value *LowerInOut::calcLdsOffsetForVsOutput(Type *outputTy, unsigned location, u const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Vertex)->entryArgIdxs.vs; auto relVertexId = getFunctionArgument(m_entryPoint, entryArgIdxs.relVertexId); - const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.calcFactor; - auto vertexStride = builder.getInt32(calcFactor.inVertexStride); - // dwordOffset = inPatchStart + relVertexId * vertexStride + attribOffset - Value *ldsOffset = builder.getInt32(calcFactor.onChip.inPatchStart); - ldsOffset = builder.CreateAdd(ldsOffset, builder.CreateMul(relVertexId, vertexStride)); + const auto &hwConfig = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.hwConfig; + // dwordOffset = inputPatchStart + relVertexId * vertexStride + attribOffset + Value *ldsOffset = builder.getInt32(hwConfig.onChip.inputPatchStart); + ldsOffset = + builder.CreateAdd(ldsOffset, builder.CreateMul(relVertexId, builder.getInt32(hwConfig.onChip.inputVertexStride))); ldsOffset = builder.CreateAdd(ldsOffset, 
attribOffset); return ldsOffset; @@ -4420,7 +4431,7 @@ Value *LowerInOut::calcLdsOffsetForTcsInput(Type *inputTy, unsigned location, Va assert(m_shaderStage == ShaderStage::TessControl); const auto &inOutUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs; - const auto &calcFactor = inOutUsage.calcFactor; + const auto &hwConfig = inOutUsage.hwConfig; // attribOffset = (location + locOffset) * 4 + compIdx Value *attribOffset = builder.getInt32(location); @@ -4442,16 +4453,14 @@ Value *LowerInOut::calcLdsOffsetForTcsInput(Type *inputTy, unsigned location, Va attribOffset = builder.CreateAdd(attribOffset, compIdx); } - // dwordOffset = inPatchStart + (relativeId * inVertexCount + vertexId) * inVertexStride + attribOffset - auto inVertexCount = m_pipelineState->getNumPatchControlPoints(); - auto inVertexCountVal = builder.getInt32(inVertexCount); + // dwordOffset = inputPatchStart + (relativeId * inputVertexCount + vertexIdx) * inputVertexStride + attribOffset + auto inputVertexCount = m_pipelineState->getNumPatchControlPoints(); auto relativeId = m_pipelineSysValues.get(m_entryPoint)->getRelativeId(); - Value *ldsOffset = builder.CreateMul(relativeId, inVertexCountVal); + Value *ldsOffset = builder.CreateMul(relativeId, builder.getInt32(inputVertexCount)); ldsOffset = builder.CreateAdd(ldsOffset, vertexIdx); - auto inVertexStride = builder.getInt32(calcFactor.inVertexStride); - ldsOffset = builder.CreateMul(ldsOffset, inVertexStride); + ldsOffset = builder.CreateMul(ldsOffset, builder.getInt32(hwConfig.onChip.inputVertexStride)); ldsOffset = - builder.CreateAdd(builder.getInt32(calcFactor.onChip.inPatchStart), builder.CreateAdd(ldsOffset, attribOffset)); + builder.CreateAdd(builder.getInt32(hwConfig.onChip.inputPatchStart), builder.CreateAdd(ldsOffset, attribOffset)); return ldsOffset; } @@ -4467,13 +4476,12 @@ Value *LowerInOut::calcLdsOffsetForTcsInput(Type *inputTy, unsigned location, Va // @param builder : The IR builder to 
create and insert IR instruction Value *LowerInOut::calcLdsOffsetForTcsOutput(Type *outputTy, unsigned location, Value *locOffset, Value *compIdx, Value *vertexIdx, BuilderBase &builder) { + // NOTE: TCS outputs are always stored to on-chip LDS first. Then, they are store to off-chip LDS buffer (which will + // be loaded by TES). assert(m_shaderStage == ShaderStage::TessControl); const auto &inOutUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs; - const auto &calcFactor = inOutUsage.calcFactor; - - auto outPatchStart = calcFactor.offChip.outPatchStart; - auto patchConstStart = calcFactor.offChip.patchConstStart; + const auto &hwConfig = inOutUsage.hwConfig; // attribOffset = (location + locOffset) * 4 + compIdx * bitWidth / 32 Value *attribOffset = builder.getInt32(location); @@ -4496,30 +4504,19 @@ Value *LowerInOut::calcLdsOffsetForTcsOutput(Type *outputTy, unsigned location, } Value *ldsOffset = nullptr; - - const bool perPatch = (!vertexIdx); // Vertex indexing is unavailable for per-patch output auto relativeId = m_pipelineSysValues.get(m_entryPoint)->getRelativeId(); - if (perPatch) { - // dwordOffset = patchConstStart + relativeId * patchConstSize + attribOffset - auto patchConstSize = builder.getInt32(calcFactor.patchConstSize); - ldsOffset = builder.CreateMul(relativeId, patchConstSize); - - auto patchConstStartVal = builder.getInt32(patchConstStart); - ldsOffset = builder.CreateAdd(ldsOffset, patchConstStartVal); - + if (vertexIdx) { + // dwordOffset = outputPatchStart + (relativeId * outputVertexCount + vertexIdx) * outputVertexStride + attribOffset + // = outputPatchStart + relativeId * outputPatchSize + vertexIdx * outputVertexStride + attribOffset + ldsOffset = builder.CreateMul(relativeId, builder.getInt32(hwConfig.onChip.outputPatchSize)); + ldsOffset = builder.CreateAdd(ldsOffset, builder.getInt32(hwConfig.onChip.outputPatchStart)); + ldsOffset = builder.CreateAdd(ldsOffset, + builder.CreateMul(vertexIdx, 
builder.getInt32(hwConfig.onChip.outputVertexStride))); ldsOffset = builder.CreateAdd(ldsOffset, attribOffset); } else { - // dwordOffset = outPatchStart + (relativeId * outVertexCount + vertexId) * outVertexStride + attribOffset - // = outPatchStart + relativeId * outPatchSize + vertexId * outVertexStride + attribOffset - auto outPatchSize = builder.getInt32(calcFactor.outPatchSize); - ldsOffset = builder.CreateMul(relativeId, outPatchSize); - - auto outPatchStartVal = builder.getInt32(outPatchStart); - ldsOffset = builder.CreateAdd(ldsOffset, outPatchStartVal); - - auto outVertexStride = builder.getInt32(calcFactor.outVertexStride); - ldsOffset = builder.CreateAdd(ldsOffset, builder.CreateMul(vertexIdx, outVertexStride)); - + // dwordOffset = patchConstStart + relativeId * patchConstSize + attribOffset + ldsOffset = builder.CreateMul(relativeId, builder.getInt32(hwConfig.onChip.patchConstSize)); + ldsOffset = builder.CreateAdd(ldsOffset, builder.getInt32(hwConfig.onChip.patchConstStart)); ldsOffset = builder.CreateAdd(ldsOffset, attribOffset); } @@ -4539,11 +4536,7 @@ Value *LowerInOut::calcLdsOffsetForTesInput(Type *inputTy, unsigned location, Va Value *vertexIdx, BuilderBase &builder) { assert(m_shaderStage == ShaderStage::TessEval); - const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.calcFactor; - - auto outPatchStart = calcFactor.offChip.outPatchStart; - auto patchConstStart = calcFactor.offChip.patchConstStart; - + const auto &hwConfig = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.hwConfig; const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs.tes; auto relPatchId = getFunctionArgument(m_entryPoint, entryArgIdxs.relPatchId); @@ -4569,29 +4562,18 @@ Value *LowerInOut::calcLdsOffsetForTesInput(Type *inputTy, unsigned location, Va } Value *ldsOffset = nullptr; - - const bool perPatch = (!vertexIdx); // Vertex indexing 
is unavailable for per-patch input - if (perPatch) { - // dwordOffset = patchConstStart + relPatchId * patchConstSize + attribOffset - auto patchConstSize = builder.getInt32(calcFactor.patchConstSize); - ldsOffset = builder.CreateMul(relPatchId, patchConstSize); - - auto patchConstStartVal = builder.getInt32(patchConstStart); - ldsOffset = builder.CreateAdd(ldsOffset, patchConstStartVal); - + if (vertexIdx) { + // dwordOffset = patchStart + (relPatchId * vertexCount + vertexIdx) * vertexStride + attribOffset + // = patchStart + relPatchId * patchSize + vertexIdx * vertexStride + attribOffset + ldsOffset = builder.CreateMul(relPatchId, builder.getInt32(hwConfig.offChip.outputPatchSize)); + ldsOffset = builder.CreateAdd(ldsOffset, builder.getInt32(hwConfig.offChip.outputPatchStart)); + ldsOffset = builder.CreateAdd(ldsOffset, + builder.CreateMul(vertexIdx, builder.getInt32(hwConfig.offChip.outputVertexStride))); ldsOffset = builder.CreateAdd(ldsOffset, attribOffset); } else { - // dwordOffset = patchStart + (relPatchId * vertexCount + vertexId) * vertexStride + attribOffset - // = patchStart + relPatchId * patchSize + vertexId * vertexStride + attribOffset - auto patchSize = builder.getInt32(calcFactor.outPatchSize); - ldsOffset = builder.CreateMul(relPatchId, patchSize); - - auto patchStart = builder.getInt32(outPatchStart); - ldsOffset = builder.CreateAdd(ldsOffset, patchStart); - - auto vertexStride = builder.getInt32(calcFactor.outVertexStride); - ldsOffset = builder.CreateAdd(ldsOffset, builder.CreateMul(vertexIdx, vertexStride)); - + // dwordOffset = patchConstStart + relPatchId * patchConstSize + attribOffset + ldsOffset = builder.CreateMul(relPatchId, builder.getInt32(hwConfig.offChip.patchConstSize)); + ldsOffset = builder.CreateAdd(ldsOffset, builder.getInt32(hwConfig.offChip.patchConstStart)); ldsOffset = builder.CreateAdd(ldsOffset, attribOffset); } @@ -4599,18 +4581,17 @@ Value *LowerInOut::calcLdsOffsetForTesInput(Type *inputTy, unsigned location, Va } 
// ===================================================================================================================== -// Calculates the patch count for per-thread group. +// Calculates maximum number of HS patches per thread group. // -// @param inVertexCount : Count of vertices of input patch -// @param inVertexStride : Vertex stride of input patch in (dwords) -// @param outVertexCount : Count of vertices of output patch -// @param outVertexStride : Vertex stride of output patch in (dwords) -// @param patchConstCount : Count of output patch constants -// @param tessFactorStride : Stride of tessellation factors (dwords) -unsigned LowerInOut::calcPatchCountPerThreadGroup(unsigned inVertexCount, unsigned inVertexStride, - unsigned outVertexCount, unsigned outVertexStride, - unsigned patchConstCount, unsigned tessFactorStride) const { - unsigned maxThreadCountPerThreadGroup = MaxHsThreadsPerSubgroup; +// @param inputVertexCount : Count of vertices of input patch +// @param outputVertexCount : Count of vertices of output patch +// @param tessFactorCount : Count of tessellation factors +// @param ldsSizePerPatch : On-chip LDS size per patch (in dwords) +// @param ldsBufferSizePerPatch : Off-chip LDS buffer size per patch (in dwords) +unsigned LowerInOut::calcMaxNumPatchesPerGroup(unsigned inputVertexCount, unsigned outputVertexCount, + unsigned tessFactorCount, unsigned ldsSizePerPatch, + unsigned ldsBufferSizePerPatch) const { + unsigned maxNumThreadsPerGroup = MaxHsThreadsPerSubgroup; // NOTE: If ray query uses LDS stack, the expected max thread count in the group is 64. And we force wave size // to be 64 in order to keep all threads in the same wave. 
In the future, we could consider to get rid of this @@ -4619,46 +4600,38 @@ unsigned LowerInOut::calcPatchCountPerThreadGroup(unsigned inVertexCount, unsign const auto vsResUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Vertex); const auto tcsResUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl); if (vsResUsage->useRayQueryLdsStack || tcsResUsage->useRayQueryLdsStack) { - maxThreadCountPerThreadGroup = std::min(MaxRayQueryThreadsPerGroup, maxThreadCountPerThreadGroup); + maxNumThreadsPerGroup = std::min(MaxRayQueryThreadsPerGroup, maxNumThreadsPerGroup); rayQueryLdsStackSize = MaxRayQueryLdsStackEntries * MaxRayQueryThreadsPerGroup; } - const unsigned maxThreadCountPerPatch = std::max(inVertexCount, outVertexCount); - const unsigned patchCountLimitedByThread = maxThreadCountPerThreadGroup / maxThreadCountPerPatch; - - const unsigned inPatchSize = (inVertexCount * inVertexStride); - const unsigned outPatchSize = (outVertexCount * outVertexStride); - const unsigned patchConstSize = patchConstCount * 4; - - // Compute the required LDS size per patch, always include the space for input patch and tess factor - unsigned ldsSizePerPatch = inPatchSize + MaxTessFactorsPerPatch; + const unsigned maxNumThreadsPerPatch = std::max(inputVertexCount, outputVertexCount); + const unsigned numPatchesLimitedByThread = maxNumThreadsPerGroup / maxNumThreadsPerPatch; - unsigned ldsSizePerThreadGroup = m_pipelineState->getTargetInfo().getGpuProperty().ldsSizePerThreadGroup; + unsigned ldsSizePerGroup = m_pipelineState->getTargetInfo().getGpuProperty().ldsSizePerThreadGroup; if (m_pipelineState->canOptimizeTessFactor()) { // NOTE: If we are going to optimize TF store, we need additional on-chip LDS size. The required size is // 2 dwords per HS wave (1 dword all-ones flag or 1 dword all-zeros flag) plus an extra dword to // count actual HS patches. 
assert(m_gfxIp.major >= 11); const unsigned maxNumHsWaves = - MaxHsThreadsPerSubgroup / m_pipelineState->getMergedShaderWaveSize(ShaderStage::TessControl); - ldsSizePerThreadGroup -= 1 + maxNumHsWaves * 2; + MaxHsThreadsPerSubgroup / m_pipelineState->getShaderWaveSize(ShaderStage::TessControl); + ldsSizePerGroup -= 1 + maxNumHsWaves * 2; } - ldsSizePerThreadGroup -= rayQueryLdsStackSize; // Exclude LDS space used as ray query stack + ldsSizePerGroup -= rayQueryLdsStackSize; // Exclude LDS space used as ray query stack - unsigned patchCountLimitedByLds = ldsSizePerThreadGroup / ldsSizePerPatch; + unsigned numPatchesLimitedByLds = ldsSizePerGroup / ldsSizePerPatch; - unsigned patchCountPerThreadGroup = std::min(patchCountLimitedByThread, patchCountLimitedByLds); + unsigned maxNumPatchesPerGroup = std::min(numPatchesLimitedByThread, numPatchesLimitedByLds); - // NOTE: Performance analysis shows that 16 patches per thread group is an optimal upper-bound. The value is only - // an experimental number. For GFX9. 64 is an optimal number instead. - const unsigned optimalPatchCountPerThreadGroup = 64; + // NOTE: Performance analysis shows that 16 patches per group is an optimal upper-bound. The value is only + // an experimental number. 
+ const unsigned optimalNumPatchesPerGroup = 64; + maxNumPatchesPerGroup = std::min(maxNumPatchesPerGroup, optimalNumPatchesPerGroup); - patchCountPerThreadGroup = std::min(patchCountPerThreadGroup, optimalPatchCountPerThreadGroup); - - auto outPatchLdsBufferSize = (outPatchSize + patchConstSize) * 4; - auto tessOffChipPatchCountPerThreadGroup = - m_pipelineState->getTargetInfo().getGpuProperty().tessOffChipLdsBufferSize / outPatchLdsBufferSize; - patchCountPerThreadGroup = std::min(patchCountPerThreadGroup, tessOffChipPatchCountPerThreadGroup); + unsigned outputPatchLdsBufferSize = ldsBufferSizePerPatch * sizeof(unsigned); + auto offChipNumHsPatchesPerGroup = + m_pipelineState->getTargetInfo().getGpuProperty().tessOffChipLdsBufferSize / outputPatchLdsBufferSize; + maxNumPatchesPerGroup = std::min(maxNumPatchesPerGroup, offChipNumHsPatchesPerGroup); // TF-Buffer-based limit for Patchers per Thread Group: // --------------------------------------------------------------------------------------------- @@ -4667,22 +4640,22 @@ unsigned LowerInOut::calcPatchCountPerThreadGroup(unsigned inVertexCount, unsign // assume that one thread-group could at most utilize all of the TF Buffer. 
const unsigned tfBufferSizeInBytes = sizeof(unsigned) * m_pipelineState->getTargetInfo().getGpuProperty().tessFactorBufferSizePerSe; - unsigned tfBufferPatchCountLimit = tfBufferSizeInBytes / (tessFactorStride * sizeof(unsigned)); + unsigned tfBufferNumPatchesLimit = tfBufferSizeInBytes / (tessFactorCount * sizeof(unsigned)); const auto workarounds = &m_pipelineState->getTargetInfo().getGpuWorkarounds(); if (workarounds->gfx10.waTessFactorBufferSizeLimitGeUtcl1Underflow) { - tfBufferPatchCountLimit /= 2; + tfBufferNumPatchesLimit /= 2; } - patchCountPerThreadGroup = std::min(patchCountPerThreadGroup, tfBufferPatchCountLimit); + maxNumPatchesPerGroup = std::min(maxNumPatchesPerGroup, tfBufferNumPatchesLimit); // For all-offchip tessellation, we need to write an additional 4-byte TCS control word to the TF buffer whenever // the patch-ID is zero. - const unsigned offChipTfBufferPatchCountLimit = - (tfBufferSizeInBytes - (patchCountPerThreadGroup * sizeof(unsigned))) / (tessFactorStride * sizeof(unsigned)); - patchCountPerThreadGroup = std::min(patchCountPerThreadGroup, offChipTfBufferPatchCountLimit); + const unsigned offChipTfBufferNumPatchesLimit = + (tfBufferSizeInBytes - (maxNumPatchesPerGroup * sizeof(unsigned))) / (tessFactorCount * sizeof(unsigned)); + maxNumPatchesPerGroup = std::min(maxNumPatchesPerGroup, offChipTfBufferNumPatchesLimit); - return patchCountPerThreadGroup; + return maxNumPatchesPerGroup; } // ===================================================================================================================== @@ -4764,7 +4737,7 @@ void LowerInOut::addExportInstForGenericOutput(Value *output, unsigned location, attribValues[i] = exportValues[i - startChannel]; m_expLocs.insert(location); - recordVertexAttribExport(location, {attribValues[0], attribValues[1], attribValues[2], attribValues[3]}); + recordVertexAttribute(location, {attribValues[0], attribValues[1], attribValues[2], attribValues[3]}); } else { // We have to do exporting twice for 
this output assert(startChannel == 0); // Other values are disallowed according to GLSL spec @@ -4775,10 +4748,10 @@ void LowerInOut::addExportInstForGenericOutput(Value *output, unsigned location, attribValues[i] = exportValues[i]; m_expLocs.insert(location); // First export - recordVertexAttribExport(location, {attribValues[0], attribValues[1], attribValues[2], attribValues[3]}); + recordVertexAttribute(location, {attribValues[0], attribValues[1], attribValues[2], attribValues[3]}); m_expLocs.insert(location + 1); // Second export - recordVertexAttribExport(location + 1, {attribValues[4], attribValues[5], attribValues[6], attribValues[7]}); + recordVertexAttribute(location + 1, {attribValues[4], attribValues[5], attribValues[6], attribValues[7]}); } } @@ -4793,38 +4766,15 @@ void LowerInOut::addExportInstForBuiltInOutput(Value *output, unsigned builtInId switch (builtInId) { case BuiltInPosition: { - Value *args[] = { - builder.getInt32(EXP_TARGET_POS_0), // tgt - builder.getInt32(0xF), // en - nullptr, - nullptr, - nullptr, - nullptr, - builder.getInt1(false), // done - builder.getInt1(false) // vm - }; - - // src0 ~ src3 - for (unsigned i = 0; i < 4; ++i) { - auto compValue = builder.CreateExtractElement(output, builder.getInt32(i)); - args[2 + i] = compValue; - } + SmallVector positions; + for (unsigned i = 0; i < 4; ++i) + positions.push_back(builder.CreateExtractElement(output, builder.getInt32(i))); - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_exp, args); + exportPosition(0, positions, builder); break; } case BuiltInPointSize: { - Value *args[] = { - builder.getInt32(EXP_TARGET_POS_1), // tgt - builder.getInt32(0x1), // en - output, // src0 - poison, // src1 - poison, // src2 - poison, // src3 - builder.getInt1(false), // done - builder.getInt1(false) // vm - }; - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_exp, args); + exportPosition(1, {output, poison, poison, poison}, builder); break; } case 
BuiltInPrimitiveShadingRate: { @@ -4835,19 +4785,8 @@ void LowerInOut::addExportInstForBuiltInOutput(Value *output, unsigned builtInId break; } case BuiltInEdgeFlag: { - Value *edgeflag = builder.CreateBitCast(output, builder.getFloatTy()); - - Value *args[] = { - builder.getInt32(EXP_TARGET_POS_1), // tgt - builder.getInt32(0x2), // en - PoisonValue::get(builder.getFloatTy()), // src1 - edgeflag, // src0 - PoisonValue::get(builder.getFloatTy()), // src2 - PoisonValue::get(builder.getFloatTy()), // src3 - builder.getInt1(false), // done - builder.getInt1(false) // vm - }; - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_exp, args); + Value *edgeFlag = builder.CreateBitCast(output, builder.getFloatTy()); + exportPosition(1, {poison, edgeFlag, poison, poison}, builder); break; } default: { @@ -5290,16 +5229,7 @@ void LowerInOut::exportShadingRate(Value *shadingRate, BuilderBase &builder) { } auto poison = PoisonValue::get(builder.getFloatTy()); - // "Done" flag is valid for exporting position 0 ~ 3 - builder.CreateIntrinsic(Intrinsic::amdgcn_exp, builder.getFloatTy(), - {builder.getInt32(EXP_TARGET_POS_1), // tgt - builder.getInt32(0x2), // en - poison, // src0 - hwShadingRate, // src1 - poison, // src2 - poison, // src3 - builder.getFalse(), // done - builder.getFalse()}); // src0 + exportPosition(1, {poison, hwShadingRate, poison, poison}, builder); } // ===================================================================================================================== @@ -5381,27 +5311,27 @@ Value *LowerInOut::getShadingRate(BuilderBase &builder) { } // ===================================================================================================================== -// Records export info of vertex attributes +// Record export info of vertex attributes. 
// -// @param location : Vertex attribute location -// @param attribValues : Values of this vertex attribute to export -void LowerInOut::recordVertexAttribExport(unsigned location, ArrayRef attribValues) { +// @param exportSlot : Export slot +// @param exportValues : Values of this vertex attribute to export +void LowerInOut::recordVertexAttribute(unsigned exportSlot, ArrayRef exportValues) { assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || m_shaderStage == ShaderStage::CopyShader); // Valid shader stages - assert(location <= MaxInOutLocCount); // 32 attributes at most - assert(attribValues.size() == 4); // Must have 4 elements, corresponds to <4 x float> + assert(exportSlot <= MaxInOutLocCount); // 32 attributes at most + assert(exportValues.size() == 4); // Must have 4 elements, corresponds to <4 x float> auto poison = PoisonValue::get(Type::getFloatTy(*m_context)); // Vertex attribute not existing, insert a new one and initialize it - if (m_attribExports.count(location) == 0) { + if (m_attribExports.count(exportSlot) == 0) { for (unsigned i = 0; i < 4; ++i) - m_attribExports[location][i] = poison; + m_attribExports[exportSlot][i] = poison; } for (unsigned i = 0; i < 4; ++i) { - assert(attribValues[i]); - if (isa(attribValues[i]) || isa(attribValues[i])) + assert(exportValues[i]); + if (isa(exportValues[i]) || isa(exportValues[i])) continue; // Here, we only record new attribute values that are valid (not unspecified ones) // NOTE: The existing values must have been initialized to unspecified ones already. 
Overlapping is disallowed (see @@ -5412,19 +5342,19 @@ void LowerInOut::recordVertexAttribExport(unsigned location, ArrayRef a // - Invalid: // Existing: attrib0, <1.0, 2.0, 3.0, undef/poison> // New: attrib0, - assert(isa(m_attribExports[location][i]) || isa(m_attribExports[location][i])); - m_attribExports[location][i] = attribValues[i]; // Update values that are valid + assert(isa(m_attribExports[exportSlot][i]) || isa(m_attribExports[exportSlot][i])); + m_attribExports[exportSlot][i] = exportValues[i]; // Update values that are valid } auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->inOutUsage; - inOutUsage.expCount = std::max(inOutUsage.expCount, location + 1); // Update export count + inOutUsage.expCount = std::max(inOutUsage.expCount, exportSlot + 1); // Update export count } // ===================================================================================================================== -// Exports vertex attributes that were recorded previously +// Export vertex attributes that were recorded previously. 
// -// @param builder : the builder to use -void LowerInOut::exportVertexAttribs(BuilderBase &builder) { +// @param builder : IR builder +void LowerInOut::exportAttributes(BuilderBase &builder) { assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || m_shaderStage == ShaderStage::CopyShader); // Valid shader stages if (m_attribExports.empty()) { @@ -5433,33 +5363,116 @@ void LowerInOut::exportVertexAttribs(BuilderBase &builder) { } for (auto &attribExport : m_attribExports) { - if (m_pipelineState->exportAttributeByExportInstruction()) { + const auto &[exportSlot, exportValues] = attribExport; + assert(exportValues.size() == 4); // Must be <4 x float> + + if (m_pipelineState->getNggControl()->enableNgg) { + builder.create(exportSlot, exportValues[0], exportValues[1], exportValues[2], + exportValues[3]); + } else { unsigned channelMask = 0; for (unsigned i = 0; i < 4; ++i) { - assert(attribExport.second[i]); - if (!isa(attribExport.second[i]) && !isa(attribExport.second[i])) + assert(exportValues[i]); + if (!isa(exportValues[i]) && !isa(exportValues[i])) channelMask |= (1u << i); // Update channel mask if the value is valid (not unspecified) } builder.CreateIntrinsic(Intrinsic::amdgcn_exp, builder.getFloatTy(), - {builder.getInt32(EXP_TARGET_PARAM_0 + attribExport.first), // tgt - builder.getInt32(channelMask), // en - attribExport.second[0], // src0 - attribExport.second[1], // src1 - attribExport.second[2], // src2 - attribExport.second[3], // src3 - builder.getFalse(), // done - builder.getFalse()}); // src0 - } else { - Value *attribValue = PoisonValue::get(FixedVectorType::get(builder.getFloatTy(), 4)); // Always be <4 x float> - for (unsigned i = 0; i < 4; ++i) - attribValue = builder.CreateInsertElement(attribValue, attribExport.second[i], i); - // NOTE: Create a call if we export vertex attribute through memory. This call will be expanded when NGG primitive - // shader is generated. 
The arguments are: attribute location, and attribute export value. - builder.CreateNamedCall(lgcName::NggAttributeThroughMemory, builder.getVoidTy(), - {builder.getInt32(attribExport.first), attribValue}, {}); + {builder.getInt32(EXP_TARGET_PARAM_0 + exportSlot), // tgt + builder.getInt32(channelMask), // en + exportValues[0], // src0 + exportValues[1], // src1 + exportValues[2], // src2 + exportValues[3], // src3 + builder.getFalse(), // done + builder.getFalse()}); // vm } } } +// ===================================================================================================================== +static Value *adjustIj(Value *value, Value *offset, BuilderImpl &builder) { + offset = builder.CreateFPExt(offset, FixedVectorType::get(builder.getFloatTy(), 2)); + Value *offsetX = builder.CreateExtractElement(offset, uint64_t(0)); + Value *offsetY = builder.CreateExtractElement(offset, 1); + if (auto vecTy = dyn_cast(value->getType())) { + offsetX = builder.CreateVectorSplat(vecTy->getNumElements(), offsetX); + offsetY = builder.CreateVectorSplat(vecTy->getNumElements(), offsetY); + } + Value *derivX = builder.CreateDerivative(value, /*isY=*/false, /*isFine=*/true); + Value *derivY = builder.CreateDerivative(value, /*isY=*/true, /*isFine=*/true); + Value *adjustX = builder.CreateFAdd(value, builder.CreateFMul(derivX, offsetX)); + Value *adjustY = builder.CreateFAdd(adjustX, builder.CreateFMul(derivY, offsetY)); + return adjustY; +} + +// ===================================================================================================================== +// Evaluate I,J for interpolation: center offset, smooth (perspective) version +void LowerInOut::visitEvalIjOffsetSmoothOp(EvalIjOffsetSmoothOp &op) { + BuilderBase builderBase(&op); + // Get + Value *pullModel = patchFsBuiltInInputImport(FixedVectorType::get(builderBase.getFloatTy(), 3), BuiltInInterpPullMode, + nullptr, builderBase); + BuilderImpl builder(m_pipelineState); + 
builder.SetInsertPoint(builderBase.GetInsertPoint()); + builder.setFastMathFlags(op.getFastMathFlags()); + // Adjust each coefficient by offset. + Value *adjusted = adjustIj(pullModel, op.getValue(), builder); + // Extract part of that + Value *ijDivW = builder.CreateShuffleVector(adjusted, adjusted, ArrayRef{0, 1}); + Value *rcpW = builder.CreateExtractElement(adjusted, 2); + // Get W by making a reciprocal of 1/W + Value *w = builder.CreateFDiv(ConstantFP::get(builder.getFloatTy(), 1.0), rcpW); + w = builder.CreateVectorSplat(2, w); + auto res = builder.CreateFMul(ijDivW, w); + + op.replaceAllUsesWith(res); + op.eraseFromParent(); +} + +// ===================================================================================================================== +// Adjusts value by its X and Y derivatives times the X and Y components of offset. +void LowerInOut::visitAdjustIjOp(AdjustIjOp &op) { + BuilderImpl builder(m_pipelineState); + builder.SetInsertPoint(&op); + builder.setFastMathFlags(op.getFastMathFlags()); + Value *adjusted = adjustIj(op.getValue(), op.getOffset(), builder); + + op.replaceAllUsesWith(adjusted); + op.eraseFromParent(); +} + +// ===================================================================================================================== +// Export vertex position. 
+// +// @param exportSlot : Export slot +// @param exportValues : Vertex position values to export +// @param builder : IR builder +void LowerInOut::exportPosition(unsigned exportSlot, ArrayRef exportValues, BuilderBase &builder) { + assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || + m_shaderStage == ShaderStage::CopyShader); // Valid shader stages + assert(exportValues.size() == 4); // Must be <4 x float> + + if (m_pipelineState->getNggControl()->enableNgg) { + builder.create(exportSlot, exportValues[0], exportValues[1], exportValues[2], exportValues[3]); + } else { + unsigned channelMask = 0; + for (unsigned i = 0; i < 4; ++i) { + assert(exportValues[i]); + if (!isa(exportValues[i]) && !isa(exportValues[i])) + channelMask |= (1u << i); // Update channel mask if the value is valid (not unspecified) + } + + builder.CreateIntrinsic(Intrinsic::amdgcn_exp, builder.getFloatTy(), + {builder.getInt32(EXP_TARGET_POS_0 + exportSlot), // tgt + builder.getInt32(channelMask), // en + exportValues[0], // src0 + exportValues[1], // src1 + exportValues[2], // src2 + exportValues[3], // src3 + builder.getFalse(), // done + builder.getFalse()}); // vm + } +} + } // namespace lgc diff --git a/lgc/patch/LowerInvariantLoads.cpp b/lgc/patch/LowerInvariantLoads.cpp index 4ab3f3c05d..ddba756395 100644 --- a/lgc/patch/LowerInvariantLoads.cpp +++ b/lgc/patch/LowerInvariantLoads.cpp @@ -29,6 +29,7 @@ *********************************************************************************************************************** */ #include "lgc/patch/LowerInvariantLoads.h" +#include "lgc/LgcDialect.h" #include "lgc/patch/LgcLowering.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" @@ -59,13 +60,13 @@ enum AddrSpaceBit { }; static unsigned findAddressSpaceAccess(const Instruction *inst) { - if (const LoadInst *li = dyn_cast(inst)) { - return std::min(li->getPointerAddressSpace(), UNKNOWN_ADDRESS_SPACE); - } else if (const StoreInst *si 
= dyn_cast(inst)) { - return std::min(si->getPointerAddressSpace(), UNKNOWN_ADDRESS_SPACE); + if (const LoadInst *load = dyn_cast(inst)) { + return std::min(load->getPointerAddressSpace(), UNKNOWN_ADDRESS_SPACE); + } else if (const StoreInst *store = dyn_cast(inst)) { + return std::min(store->getPointerAddressSpace(), UNKNOWN_ADDRESS_SPACE); } else { - if (const CallInst *ci = dyn_cast(inst)) { - auto func = ci->getCalledFunction(); + if (const CallInst *call = dyn_cast(inst)) { + auto func = call->getCalledFunction(); if (func) { // Treat these as buffer address space as they do not overlap with private. if (func->getName().starts_with("llvm.amdgcn.image") || func->getName().starts_with("llvm.amdgcn.raw") || @@ -154,8 +155,8 @@ PreservedAnalyses LowerInvariantLoads::run(Function &function, FunctionAnalysisM for (BasicBlock &block : function) { for (Instruction &inst : block) { if (!clearInvariants && inst.mayWriteToMemory()) { - if (IntrinsicInst *ii = dyn_cast(&inst)) { - switch (ii->getIntrinsicID()) { + if (IntrinsicInst *intrinsic = dyn_cast(&inst)) { + switch (intrinsic->getIntrinsicID()) { case Intrinsic::amdgcn_exp: case Intrinsic::amdgcn_exp_compr: case Intrinsic::amdgcn_init_exec: @@ -167,9 +168,9 @@ PreservedAnalyses LowerInvariantLoads::run(Function &function, FunctionAnalysisM default: break; } - } else if (CallInst *ci = dyn_cast(&inst)) { - auto func = ci->getCalledFunction(); - if (func && func->getName().starts_with("lgc.ngg.")) + } else if (CallInst *call = dyn_cast(&inst)) { + if (isa(call) || isa(call) || isa(call) || + isa(call)) continue; } unsigned addrSpace = findAddressSpaceAccess(&inst); @@ -179,7 +180,8 @@ PreservedAnalyses LowerInvariantLoads::run(Function &function, FunctionAnalysisM } writtenAddrSpaces |= aliasMatrix[addrSpace]; } else if (inst.mayReadFromMemory()) { - loads.push_back(&inst); + if (!isa(inst)) + loads.push_back(&inst); } } } diff --git a/lgc/patch/MeshTaskShader.cpp b/lgc/patch/MeshTaskShader.cpp index 
91a593450b..6d1a284f3c 100644 --- a/lgc/patch/MeshTaskShader.cpp +++ b/lgc/patch/MeshTaskShader.cpp @@ -30,9 +30,9 @@ */ #include "MeshTaskShader.h" #include "ShaderMerger.h" +#include "lgc/Debug.h" #include "lgc/patch/LgcLowering.h" #include "lgc/patch/MutateEntryPoint.h" -#include "lgc/util/Debug.h" #include "lgc/util/WorkgroupLayout.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvm/IR/IRBuilder.h" @@ -722,7 +722,7 @@ void MeshTaskShader::processMeshShader(Function *entryPoint) { // Force s_barrier to be present if necessary (ignore optimization) const unsigned numMeshThreads = meshMode.workgroupSizeX * meshMode.workgroupSizeY * meshMode.workgroupSizeZ; auto primAmpFactor = - m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor.primAmpFactor; + m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig.primAmpFactor; // If we enable row export, the actual thread group size is determined by work group size provided from API mesh // shader. const unsigned flatWorkgroupSize = @@ -1253,7 +1253,7 @@ void MeshTaskShader::lowerTaskPayloadPtr(TaskPayloadPtrOp &taskPayloadPtrOp) { payloadRingBufDesc = m_builder.CreateInsertElement(payloadRingBufDesc, descWord1, 1); // Convert to fat pointer. 
- auto taskPayloadPtr = m_builder.create(payloadRingBufDesc); + auto taskPayloadPtr = m_builder.create(payloadRingBufDesc, true); taskPayloadPtrOp.replaceAllUsesWith(taskPayloadPtr); if (getShaderStage(entryPoint) == ShaderStage::Task) @@ -3139,7 +3139,7 @@ bool MeshTaskShader::checkNeedBarrierFlag(Function *entryPoint) { const auto &meshMode = m_pipelineState->getShaderModes()->getMeshShaderMode(); const unsigned numMeshThreads = meshMode.workgroupSizeX * meshMode.workgroupSizeY * meshMode.workgroupSizeZ; const unsigned numThreads = - m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor.primAmpFactor; + m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig.primAmpFactor; assert(numThreads >= numMeshThreads); const unsigned waveSize = m_pipelineState->getShaderWaveSize(ShaderStage::Mesh); diff --git a/lgc/patch/MutateEntryPoint.cpp b/lgc/patch/MutateEntryPoint.cpp index 20fa129daf..5968bda20f 100644 --- a/lgc/patch/MutateEntryPoint.cpp +++ b/lgc/patch/MutateEntryPoint.cpp @@ -556,8 +556,8 @@ bool MutateEntryPoint::lowerCpsOps(Function *func, ShaderInputs *shaderInputs) { // Lower returns. for (auto *ret : retInstrs) { - auto *cspTy = builder.getInt32Ty(); - exitInfos.push_back(CpsExitInfo(ret->getParent(), {builder.getInt32(0), PoisonValue::get(cspTy)})); + auto *dummyI32 = PoisonValue::get(builder.getInt32Ty()); + exitInfos.push_back(CpsExitInfo(ret->getParent(), {builder.getInt32(0), dummyI32, dummyI32})); builder.SetInsertPoint(ret); builder.CreateBr(tailBlock); ret->eraseFromParent(); @@ -568,7 +568,7 @@ bool MutateEntryPoint::lowerCpsOps(Function *func, ShaderInputs *shaderInputs) { vgprNum = std::max(exit.vgpr.size(), vgprNum); SmallVector newVgpr; - // Put LocalInvocationId before {vcr, csp}. + // Put LocalInvocationId before {vcr, csp, shaderIndex}. 
if (haveLocalInvocationId) newVgpr.push_back(func->getArg(numUserdata)); @@ -578,8 +578,9 @@ bool MutateEntryPoint::lowerCpsOps(Function *func, ShaderInputs *shaderInputs) { newVgpr.append(exitInfos[0].vgpr); } else { for (size_t vgprIdx = 0; vgprIdx < vgprNum; vgprIdx++) { - // We always have the leading two fixed vgpr arguments: vcr, csp. The other remaining payloads are i32 type. - Type *phiTy = vgprIdx < 2 ? exitInfos[0].vgpr[vgprIdx]->getType() : builder.getInt32Ty(); + // We always have the leading three fixed vgpr arguments: csp, shaderIndex, vcr. The other remaining payloads are + // i32 type. + Type *phiTy = vgprIdx < 3 ? exitInfos[0].vgpr[vgprIdx]->getType() : builder.getInt32Ty(); PHINode *phi = builder.CreatePHI(phiTy, exitInfos.size()); for (size_t exitIdx = 0; exitIdx < exitInfos.size(); exitIdx++) { if (vgprIdx < exitInfos[exitIdx].vgpr.size()) @@ -832,10 +833,11 @@ void MutateEntryPoint::lowerCpsJump(Function *parent, cps::JumpOp *jumpOp, Basic // Add extra args specific to the target function. SmallVector remainingArgs{jumpOp->getTail()}; - // Packing VGPR arguments {vcr, csp, rcr, args...} + // Packing VGPR arguments {vcr, csp, shaderRecIdx, rcr, args...} SmallVector vgprArgs; vgprArgs.push_back(jumpOp->getTarget()); vgprArgs.push_back(jumpOp->getCsp()); + vgprArgs.push_back(jumpOp->getShaderIndex()); vgprArgs.push_back(jumpOp->getRcr()); splitIntoI32(layout, builder, remainingArgs, vgprArgs); @@ -971,6 +973,11 @@ void MutateEntryPoint::gatherUserDataUsage(Module *module) { userDataUsage->loads.push_back(load); userDataUsage->addLoad(load.dwordOffset, load.dwordSize); }) + .add([](MutateEntryPoint &self, WriteXfbOutputOp &op) { + auto lastVertexStage = self.m_pipelineState->getLastVertexProcessingStage(); + lastVertexStage = lastVertexStage == ShaderStage::CopyShader ? 
ShaderStage::Geometry : lastVertexStage; + self.getUserDataUsage(lastVertexStage.value())->usesStreamOutTable = true; + }) .build(); visitor.visit(*this, *module); @@ -990,16 +997,15 @@ void MutateEntryPoint::gatherUserDataUsage(Module *module) { specialUserData.resize(std::max(specialUserData.size(), size_t(index + 1))); specialUserData[index].users.push_back(call); } - continue; } + } - if ((func.getName().starts_with(lgcName::OutputExportXfb) && !func.use_empty()) || m_pipelineState->enableSwXfb()) { - // NOTE: For GFX11+, SW emulated stream-out will always use stream-out buffer descriptors and stream-out buffer - // offsets to calculate numbers of written primitives/dwords and update the counters. auto lastVertexStage = - auto lastVertexStage = m_pipelineState->getLastVertexProcessingStage(); - lastVertexStage = lastVertexStage == ShaderStage::CopyShader ? ShaderStage::Geometry : lastVertexStage; - getUserDataUsage(lastVertexStage.value())->usesStreamOutTable = true; - } + if (m_pipelineState->enableSwXfb()) { + // NOTE: For GFX11+, SW emulated stream-out will always use stream-out buffer descriptors and stream-out buffer + // offsets to calculate numbers of written primitives/dwords and update the counters. + auto lastVertexStage = m_pipelineState->getLastVertexProcessingStage(); + lastVertexStage = lastVertexStage == ShaderStage::CopyShader ? 
ShaderStage::Geometry : lastVertexStage; + getUserDataUsage(lastVertexStage.value())->usesStreamOutTable = true; } } diff --git a/lgc/patch/NggPrimShader.cpp b/lgc/patch/NggPrimShader.cpp index 0bc7de86ab..4070e47d31 100644 --- a/lgc/patch/NggPrimShader.cpp +++ b/lgc/patch/NggPrimShader.cpp @@ -30,9 +30,11 @@ */ #include "NggPrimShader.h" #include "ShaderMerger.h" +#include "lgc/Debug.h" +#include "lgc/LgcDialect.h" #include "lgc/patch/LgcLowering.h" #include "lgc/state/PalMetadata.h" -#include "lgc/util/Debug.h" +#include "llvm-dialects/Dialect/Visitor.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/InlineAsm.h" @@ -133,15 +135,14 @@ NggPrimShader::NggPrimShader(PipelineState *pipelineState) unsigned vertexItemSizes[MaxGsStreams] = {}; auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry); for (unsigned i = 0; i < MaxGsStreams; ++i) - vertexItemSizes[i] = resUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i]; + vertexItemSizes[i] = resUsage->inOutUsage.gs.hwConfig.gsVsVertexItemSize[i]; unsigned gsVsRingItemSizes[MaxGsStreams] = {}; const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); for (unsigned i = 0; i < MaxGsStreams; ++i) gsVsRingItemSizes[i] = vertexItemSizes[i] * geometryMode.outputVertices; - const unsigned gsPrimsPerSubgroup = - resUsage->inOutUsage.gs.calcFactor.gsPrimsPerSubgroup * geometryMode.invocations; + const unsigned gsPrimsPerSubgroup = resUsage->inOutUsage.gs.hwConfig.gsPrimsPerSubgroup * geometryMode.invocations; unsigned gsStreamBase = 0; for (unsigned i = 0; i < MaxGsStreams; ++i) { m_gsStreamBases[i] = gsStreamBase; @@ -197,7 +198,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin PrimShaderLdsLayout *ldsLayout) { assert(pipelineState->getNggControl()->enableNgg); // Must enable NGG - const auto &calcFactor = pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor; + 
const auto &hwConfig = pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig; unsigned ldsOffset = 0; // In dwords unsigned ldsRegionSize = 0; // In dwords @@ -247,7 +248,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // ES-GS ring if (ldsLayout) { - ldsRegionSize = calcFactor.esGsLdsSize; + ldsRegionSize = hwConfig.esGsLdsSize; printLdsRegionInfo("ES-GS Ring", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::EsGsRing] = std::make_pair(ldsOffset, ldsRegionSize); @@ -343,7 +344,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // GS-VS ring if (ldsLayout) { const unsigned esGsRingLdsSize = (*ldsLayout)[PrimShaderLdsRegion::EsGsRing].second; - ldsRegionSize = calcFactor.gsOnChipLdsSize - esGsRingLdsSize - ldsUsageInfo.gsExtraLdsSize; + ldsRegionSize = hwConfig.gsOnChipLdsSize - esGsRingLdsSize - ldsUsageInfo.gsExtraLdsSize; printLdsRegionInfo("GS-VS Ring", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::GsVsRing] = std::make_pair(ldsOffset, ldsRegionSize); @@ -394,7 +395,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // Distributed primitive ID if (distributePrimitiveId) { if (ldsLayout) { - ldsRegionSize = calcFactor.esVertsPerSubgroup; // 1 dword per vertex thread + ldsRegionSize = hwConfig.esVertsPerSubgroup; // 1 dword per vertex thread printLdsRegionInfo("Distributed Primitive ID", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::DistributedPrimitiveId] = std::make_pair(ldsOffset, ldsRegionSize); @@ -407,8 +408,8 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // XFB outputs if (pipelineState->enableSwXfb()) { if (ldsLayout) { - ldsRegionSize = calcFactor.esVertsPerSubgroup * - calcFactor.esGsRingItemSize; // Transform feedback outputs are stored as a ES-GS ring item + ldsRegionSize = hwConfig.esVertsPerSubgroup * + hwConfig.esGsRingItemSize; // 
Transform feedback outputs are stored as a ES-GS ring item printLdsRegionInfo("XFB Outputs", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::XfbOutput] = std::make_pair(ldsOffset, ldsRegionSize); @@ -462,7 +463,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // Distributed primitive ID if (distributePrimitiveId) { if (ldsLayout) { - ldsRegionSize = calcFactor.esVertsPerSubgroup; // 1 dword per vertex thread + ldsRegionSize = hwConfig.esVertsPerSubgroup; // 1 dword per vertex thread printLdsRegionInfo("Distributed Primitive ID", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::DistributedPrimitiveId] = std::make_pair(ldsOffset, ldsRegionSize); @@ -484,7 +485,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // Vertex cull info if (ldsLayout) { ldsRegionSize = - calcFactor.esGsRingItemSize * calcFactor.esVertsPerSubgroup; // Vertex cull info is stored as a ES-GS ring item + hwConfig.esGsRingItemSize * hwConfig.esVertsPerSubgroup; // Vertex cull info is stored as a ES-GS ring item printLdsRegionInfo("Vertex Cull Info", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::VertexCullInfo] = std::make_pair(ldsOffset, ldsRegionSize); @@ -769,26 +770,24 @@ unsigned NggPrimShader::calcVertexCullInfoSizeAndOffsets(PipelineState *pipeline unsigned NggPrimShader::calcEsXfbOutputsSize(Function *esMain) { unsigned xfbOutputsSize = 0; - for (auto &func : esMain->getParent()->functions()) { - if (!func.getName().starts_with(lgcName::OutputExportXfb) && !func.getName().starts_with(lgcName::NggXfbExport)) - continue; - - for (auto user : func.users()) { - CallInst *const call = dyn_cast(user); - assert(call); - - if (call->getFunction() != esMain) - continue; - - auto xfbOutput = call->getArgOperand(call->arg_size() - 1); - - Type *xfbOutputTy = xfbOutput->getType(); - unsigned xfbOutputSize = xfbOutputTy->isVectorTy() ? 
cast(xfbOutputTy)->getNumElements() : 1; - if (xfbOutputTy->getScalarSizeInBits() == 64) - xfbOutputSize *= 2; // Double it - xfbOutputsSize += xfbOutputSize; - } - } + struct Payload { + unsigned &xfbOutputsSize; + }; + Payload payload = {xfbOutputsSize}; + + static const auto visitor = llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](Payload &payload, WriteXfbOutputOp &writeXfbOutputOp) { + Type *xfbOutputTy = writeXfbOutputOp.getOutputValue()->getType(); + unsigned xfbOutputSize = xfbOutputTy->isVectorTy() + ? cast(xfbOutputTy)->getNumElements() + : 1; + if (xfbOutputTy->getScalarSizeInBits() == 64) + xfbOutputSize *= 2; // Double it + payload.xfbOutputsSize += xfbOutputSize; + }) + .build(); + visitor.visit(payload, *esMain); return xfbOutputsSize; } @@ -1285,7 +1284,7 @@ void NggPrimShader::buildPrimShader(Function *primShader) { const unsigned dummyExportCount = waNggCullingNoEmptySubgroups ? 1 : 0; const unsigned esGsRingItemSize = - m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor.esGsRingItemSize; + m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig.esGsRingItemSize; // NOTE: Make sure vertex position data is 4-dword alignment because we will use 128-bit LDS read/write for it. 
assert(getLdsRegionStart(PrimShaderLdsRegion::VertexPosition) % 4U == 0); @@ -1910,6 +1909,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { const bool cullingMode = !m_nggControl->passthroughMode; const auto rasterStream = m_pipelineState->getRasterizerState().rasterStream; + const bool noRasterization = rasterStream == InvalidValue; SmallVector args; for (auto &arg : primShader->args()) @@ -2010,34 +2010,70 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { auto beginGsBlock = createBlock(primShader, ".beginGs"); auto endGsBlock = createBlock(primShader, ".endGs"); - auto initVertexCountsBlock = createBlock(primShader, ".initVertexCounts"); - auto endInitVertexCountsBlock = createBlock(primShader, ".endInitVertexCounts"); + BasicBlock *initVertexCountsBlock = nullptr; + BasicBlock *endInitVertexCountsBlock = nullptr; - auto cullPrimitiveBlock = createBlock(primShader, ".cullPrimitive"); - auto nullifyPrimitiveDataBlock = createBlock(primShader, ".nullifyPrimitiveData"); - auto endCullPrimitiveBlock = createBlock(primShader, ".endCullPrimitive"); + BasicBlock *cullPrimitiveBlock = nullptr; + BasicBlock *nullifyPrimitiveDataBlock = nullptr; + BasicBlock *endCullPrimitiveBlock = nullptr; - auto checkVertexDrawFlagBlock = createBlock(primShader, ".checkVertexDrawFlag"); - auto endCheckVertexDrawFlagBlock = createBlock(primShader, ".endCheckVertexDrawFlag"); + BasicBlock *checkVertexDrawFlagBlock = nullptr; + BasicBlock *endCheckVertexDrawFlagBlock = nullptr; - auto accumVertexCountsBlock = createBlock(primShader, ".accumVertexCounts"); - auto endAccumVertexCountsBlock = createBlock(primShader, ".endAccumVertexCounts"); + BasicBlock *accumVertexCountsBlock = nullptr; + BasicBlock *endAccumVertexCountsBlock = nullptr; - auto compactVertexIndexBlock = createBlock(primShader, ".compactVertexIndex"); - auto endCompactVertexIndexBlock = createBlock(primShader, ".endCompactVertexIndex"); + BasicBlock *compactVertexIndexBlock = nullptr; 
+ BasicBlock *endCompactVertexIndexBlock = nullptr; - auto sendGsAllocReqBlock = createBlock(primShader, ".sendGsAllocReq"); - auto endSendGsAllocReqBlock = createBlock(primShader, ".endSendGsAllocReq"); + BasicBlock *sendGsAllocReqBlock = nullptr; + BasicBlock *endSendGsAllocReqBlock = nullptr; - auto exportPrimitiveBlock = createBlock(primShader, ".exportPrimitive"); - auto endExportPrimitiveBlock = createBlock(primShader, ".endExportPrimitive"); + BasicBlock *exportPrimitiveBlock = nullptr; + BasicBlock *endExportPrimitiveBlock = nullptr; - auto checkEmptyWaveBlock = createBlock(primShader, ".checkEmptyWave"); - auto dummyVertexExportBlock = createBlock(primShader, ".dummyVertexExport"); - auto checkExportVertexBlock = createBlock(primShader, ".checkExportVertex"); + BasicBlock *checkEmptyWaveBlock = nullptr; + BasicBlock *dummyVertexExportBlock = nullptr; + BasicBlock *checkExportVertexBlock = nullptr; - auto exportVertexBlock = createBlock(primShader, ".exportVertex"); - auto endExportVertexBlock = createBlock(primShader, ".endExportVertex"); + BasicBlock *exportVertexBlock = nullptr; + BasicBlock *endExportVertexBlock = nullptr; + + if (noRasterization) { + // NOTE: For the case of no rasterization (DX-specific), primitive/vertex exports could be completely ignored. + // We just send message GS_ALLOC_REQ to tell HW we don't have any primitive/vertex to export. 
+ sendGsAllocReqBlock = createBlock(primShader, ".sendGsAllocReq"); + endSendGsAllocReqBlock = createBlock(primShader, ".endSendGsAllocReq"); + } else { + initVertexCountsBlock = createBlock(primShader, ".initVertexCounts"); + endInitVertexCountsBlock = createBlock(primShader, ".endInitVertexCounts"); + + cullPrimitiveBlock = createBlock(primShader, ".cullPrimitive"); + nullifyPrimitiveDataBlock = createBlock(primShader, ".nullifyPrimitiveData"); + endCullPrimitiveBlock = createBlock(primShader, ".endCullPrimitive"); + + checkVertexDrawFlagBlock = createBlock(primShader, ".checkVertexDrawFlag"); + endCheckVertexDrawFlagBlock = createBlock(primShader, ".endCheckVertexDrawFlag"); + + accumVertexCountsBlock = createBlock(primShader, ".accumVertexCounts"); + endAccumVertexCountsBlock = createBlock(primShader, ".endAccumVertexCounts"); + + compactVertexIndexBlock = createBlock(primShader, ".compactVertexIndex"); + endCompactVertexIndexBlock = createBlock(primShader, ".endCompactVertexIndex"); + + sendGsAllocReqBlock = createBlock(primShader, ".sendGsAllocReq"); + endSendGsAllocReqBlock = createBlock(primShader, ".endSendGsAllocReq"); + + exportPrimitiveBlock = createBlock(primShader, ".exportPrimitive"); + endExportPrimitiveBlock = createBlock(primShader, ".endExportPrimitive"); + + checkEmptyWaveBlock = createBlock(primShader, ".checkEmptyWave"); + dummyVertexExportBlock = createBlock(primShader, ".dummyVertexExport"); + checkExportVertexBlock = createBlock(primShader, ".checkExportVertex"); + + exportVertexBlock = createBlock(primShader, ".exportVertex"); + endExportVertexBlock = createBlock(primShader, ".endExportVertex"); + } // Construct ".entry" block { @@ -2120,11 +2156,44 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { else if (m_pipelineState->enablePrimStats()) collectPrimitiveStats(); - auto validWave = - m_builder.CreateICmpULT(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(m_maxWavesPerSubgroup + 1)); - 
m_builder.CreateCondBr(validWave, initVertexCountsBlock, endInitVertexCountsBlock); + if (noRasterization) { + auto firstWaveInSubgroup = m_builder.CreateICmpEQ(m_nggInputs.waveIdInSubgroup, m_builder.getInt32(0)); + m_builder.CreateCondBr(firstWaveInSubgroup, sendGsAllocReqBlock, endSendGsAllocReqBlock); + } else { + auto validWave = + m_builder.CreateICmpULT(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(m_maxWavesPerSubgroup + 1)); + m_builder.CreateCondBr(validWave, initVertexCountsBlock, endInitVertexCountsBlock); + } + } + + // NOTE: Here, we handle the case of no rasterization (DX-specific). In such case, primitive/vertex exports could be + // completely ignored. + if (noRasterization) { + // Construct ".sendGsAllocReq" block + { + m_builder.SetInsertPoint(sendGsAllocReqBlock); + + // Clear primitive/vertex count + m_nggInputs.primCountInSubgroup = m_builder.getInt32(0); + m_nggInputs.vertCountInSubgroup = m_builder.getInt32(0); + + sendGsAllocReqMessage(); + m_builder.CreateBr(endSendGsAllocReqBlock); + } + + // Construct ".endSendGsAllocReq" block + { + m_builder.SetInsertPoint(endSendGsAllocReqBlock); + + m_builder.CreateRetVoid(); // Early return for no rasterization case + } + + return; } + // The rasterization stream must be specified now + assert(rasterStream != InvalidValue); + // Construct ".initVertexCounts" block { m_builder.SetInsertPoint(initVertexCountsBlock); @@ -2484,7 +2553,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { } // ===================================================================================================================== -// Extracts merged group/wave info and initializes part of NGG calculation factors. +// Extracts merged group/wave info and initializes part of NGG inputs. // // NOTE: This function must be invoked by the entry block of NGG shader module. 
// @@ -2886,7 +2955,7 @@ void NggPrimShader::exportPrimitive(Value *primitiveCulled) { m_builder.SetInsertPoint(compactVertexIndexBlock); const unsigned esGsRingItemSize = - m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor.esGsRingItemSize; + m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig.esGsRingItemSize; auto vertexItemOffset0 = m_builder.CreateMul(m_nggInputs.vertexIndex0, m_builder.getInt32(esGsRingItemSize)); auto vertexItemOffset1 = m_builder.CreateMul(m_nggInputs.vertexIndex1, m_builder.getInt32(esGsRingItemSize)); @@ -2979,6 +3048,7 @@ void NggPrimShader::exportPrimitiveWithGs(Value *startingVertexIndex) { // Export primitive // const auto rasterStream = m_pipelineState->getRasterizerState().rasterStream; + assert(rasterStream != InvalidValue); Value *primData = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInSubgroup, PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * rasterStream); @@ -3155,17 +3225,18 @@ void NggPrimShader::runEs(ArrayRef args) { return; } - if (!m_pipelineState->exportAttributeByExportInstruction()) { - if (!m_hasGs) // For GS, ATM is done in copy shader - exportVertexAttributeThroughMemory(m_esHandlers.main); + if (!m_hasGs) { + // For GS, vertex export is done in copy shader + IRBuilder<>::InsertPointGuard guard(m_builder); + mutateToExportVertex(m_esHandlers.main); } Value *esGsOffset = nullptr; if (m_hasGs) { - auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor; + auto &hwConfig = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig; unsigned waveSize = m_pipelineState->getShaderWaveSize(ShaderStage::Geometry); esGsOffset = - m_builder.CreateMul(m_nggInputs.waveIdInSubgroup, m_builder.getInt32(waveSize * calcFactor.esGsRingItemSize)); + m_builder.CreateMul(m_nggInputs.waveIdInSubgroup, m_builder.getInt32(waveSize * 
hwConfig.esGsRingItemSize)); } Value *offChipLdsBase = args[ShaderMerger::getSpecialSgprInputIndex(m_gfxIp, EsGs::OffChipLdsBase)]; @@ -3310,7 +3381,7 @@ Value *NggPrimShader::runPartEs(ArrayRef args, Value *position) { m_builder.SetInsertPoint(uncompactVertexBlock); const unsigned esGsRingItemSize = - m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor.esGsRingItemSize; + m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig.esGsRingItemSize; auto uncompactedVertexIndex = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInSubgroup, PrimShaderLdsRegion::VertexIndexMap); @@ -3437,24 +3508,10 @@ Value *NggPrimShader::runPartEs(ArrayRef args, Value *position) { void NggPrimShader::splitEs() { assert(m_hasGs == false); // GS must not be present - // - // Collect all export calls for further analysis - // - SmallVector expFuncs; - for (auto &func : m_esHandlers.main->getParent()->functions()) { - if (func.isIntrinsic() && func.getIntrinsicID() == Intrinsic::amdgcn_exp) - expFuncs.push_back(&func); - else if (m_gfxIp.major >= 11) { - if (func.getName().starts_with(lgcName::NggAttributeThroughMemory) || - func.getName().starts_with(lgcName::NggXfbExport)) - expFuncs.push_back(&func); - } - } - // // Preparation for fetching cull distances // - unsigned clipCullPos = EXP_TARGET_POS_1; + unsigned clipCullExportSlot = 1; unsigned clipDistanceCount = 0; unsigned cullDistanceCount = 0; @@ -3466,7 +3523,7 @@ void NggPrimShader::splitEs() { const auto &builtInUsage = resUsage->builtInUsage.tes; bool miscExport = builtInUsage.pointSize || builtInUsage.layer || builtInUsage.viewportIndex; - clipCullPos = miscExport ? EXP_TARGET_POS_2 : EXP_TARGET_POS_1; + clipCullExportSlot = miscExport ? 
2 : 1; clipDistanceCount = builtInUsage.clipDistance; cullDistanceCount = builtInUsage.cullDistance; } else { @@ -3474,7 +3531,7 @@ void NggPrimShader::splitEs() { bool miscExport = builtInUsage.pointSize || builtInUsage.layer || builtInUsage.viewportIndex; miscExport |= builtInUsage.primitiveShadingRate; - clipCullPos = miscExport ? EXP_TARGET_POS_2 : EXP_TARGET_POS_1; + clipCullExportSlot = miscExport ? 2 : 1; clipDistanceCount = builtInUsage.clipDistance; cullDistanceCount = builtInUsage.cullDistance; } @@ -3525,49 +3582,66 @@ void NggPrimShader::splitEs() { IRBuilder<>::InsertPointGuard guard(m_builder); m_builder.SetInsertPoint(retBlock); - SmallVector removedCalls; + SmallVector callsToRemove; // Fetch position and cull distances Value *position = PoisonValue::get(positionTy); SmallVector clipCullDistance(MaxClipCullDistanceCount); - for (auto func : expFuncs) { - for (auto user : func->users()) { - CallInst *const call = cast(user); - - if (call->getParent()->getParent() != esCullDataFetcher) - continue; // Export call doesn't belong to targeted function, skip - - assert(call->getParent() == retBlock); // Must in return block - - if (func->isIntrinsic() && func->getIntrinsicID() == Intrinsic::amdgcn_exp) { - unsigned exportTarget = cast(call->getArgOperand(0))->getZExtValue(); - if (exportTarget == EXP_TARGET_POS_0) { - // Get position value - m_constPositionZ = isa(call->getArgOperand(4)); - for (unsigned i = 0; i < 4; ++i) - position = m_builder.CreateInsertElement(position, call->getArgOperand(2 + i), i); - } else if (exportTarget == clipCullPos) { - // Get clip/cull distance value - if (m_nggControl->enableCullDistanceCulling) { - clipCullDistance[0] = call->getArgOperand(2); - clipCullDistance[1] = call->getArgOperand(3); - clipCullDistance[2] = call->getArgOperand(4); - clipCullDistance[3] = call->getArgOperand(5); - } - } else if (exportTarget == clipCullPos + 1 && clipDistanceCount + cullDistanceCount > 4) { - // Get clip/cull distance value - if 
(m_nggControl->enableCullDistanceCulling) { - clipCullDistance[4] = call->getArgOperand(2); - clipCullDistance[5] = call->getArgOperand(3); - clipCullDistance[6] = call->getArgOperand(4); - clipCullDistance[7] = call->getArgOperand(5); - } - } - } - - removedCalls.push_back(call); // Remove export - } + { + struct Payload { + NggPrimShader &self; + const unsigned clipCullExportSlot; + const unsigned clipDistanceCount; + const unsigned cullDistanceCount; + Value *&position; + SmallVectorImpl &clipCullDistance; + SmallVectorImpl &callsToRemove; + }; + Payload payload = {*this, clipCullExportSlot, clipDistanceCount, cullDistanceCount, + position, clipCullDistance, callsToRemove}; + + static const auto visitor = + llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](Payload &payload, NggExportPositionOp &exportPositionOp) { + auto &builder = payload.self.m_builder; + + auto exportSlot = exportPositionOp.getExportSlot(); + if (exportSlot == 0) { + // Get position value + payload.self.m_constPositionZ = isa(exportPositionOp.getExportValue2()); + payload.position = builder.CreateInsertElement(payload.position, exportPositionOp.getExportValue0(), + static_cast(0)); + payload.position = builder.CreateInsertElement(payload.position, exportPositionOp.getExportValue1(), 1); + payload.position = builder.CreateInsertElement(payload.position, exportPositionOp.getExportValue2(), 2); + payload.position = builder.CreateInsertElement(payload.position, exportPositionOp.getExportValue3(), 3); + } else if (exportSlot == payload.clipCullExportSlot) { + // Get clip/cull distance value + if (payload.self.m_nggControl->enableCullDistanceCulling) { + payload.clipCullDistance[0] = exportPositionOp.getExportValue0(); + payload.clipCullDistance[1] = exportPositionOp.getExportValue1(); + payload.clipCullDistance[2] = exportPositionOp.getExportValue2(); + payload.clipCullDistance[3] = exportPositionOp.getExportValue3(); + } + } else if 
(exportSlot == payload.clipCullExportSlot + 1 && + payload.clipDistanceCount + payload.cullDistanceCount > 4) { + // Get clip/cull distance value + if (payload.self.m_nggControl->enableCullDistanceCulling) { + payload.clipCullDistance[4] = exportPositionOp.getExportValue0(); + payload.clipCullDistance[5] = exportPositionOp.getExportValue1(); + payload.clipCullDistance[6] = exportPositionOp.getExportValue2(); + payload.clipCullDistance[7] = exportPositionOp.getExportValue3(); + } + } + + payload.callsToRemove.push_back(&exportPositionOp); + }) + .add([](Payload &payload, NggExportAttributeOp &exportAttributeOp) { + payload.callsToRemove.push_back(&exportAttributeOp); + }) + .build(); + visitor.visit(payload, *esCullDataFetcher); } Value *cullData = position; @@ -3596,29 +3670,34 @@ void NggPrimShader::splitEs() { position = esVertexExporter->getArg(0); // The first argument is vertex position data assert(position->getType() == positionTy); - for (auto func : expFuncs) { - for (auto user : func->users()) { - CallInst *const call = cast(user); - - if (call->getParent()->getParent() != esVertexExporter) - continue; // Export call doesn't belong to targeted function, skip - - if (func->isIntrinsic() && func->getIntrinsicID() == Intrinsic::amdgcn_exp) { - unsigned exportTarget = cast(call->getArgOperand(0))->getZExtValue(); - if (exportTarget == EXP_TARGET_POS_0) { - // Replace vertex position data - m_builder.SetInsertPoint(call); - call->setArgOperand(2, m_builder.CreateExtractElement(position, static_cast(0))); - call->setArgOperand(3, m_builder.CreateExtractElement(position, 1)); - call->setArgOperand(4, m_builder.CreateExtractElement(position, 2)); - call->setArgOperand(5, m_builder.CreateExtractElement(position, 3)); - } - } - } - } - - if (!m_pipelineState->exportAttributeByExportInstruction()) - exportVertexAttributeThroughMemory(esVertexExporter); + { + struct Payload { + NggPrimShader &self; + Value *position; + }; + Payload payload = {*this, position}; + + 
static const auto visitor = + llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](Payload &payload, NggExportPositionOp &exportPositionOp) { + auto &builder = payload.self.m_builder; + builder.SetInsertPoint(&exportPositionOp); + + if (exportPositionOp.getExportSlot() == 0) { + // Replace vertex position data + exportPositionOp.setExportValue0( + builder.CreateExtractElement(payload.position, static_cast(0))); + exportPositionOp.setExportValue1(builder.CreateExtractElement(payload.position, 1)); + exportPositionOp.setExportValue2(builder.CreateExtractElement(payload.position, 2)); + exportPositionOp.setExportValue3(builder.CreateExtractElement(payload.position, 3)); + } + }) + .build(); + visitor.visit(payload, *esVertexExporter); + } + + mutateToExportVertex(esVertexExporter); // Remove original ES since it is no longer needed assert(m_esHandlers.main->use_empty()); @@ -3629,8 +3708,7 @@ void NggPrimShader::splitEs() { m_esHandlers.cullDataFetcher = esCullDataFetcher; m_esHandlers.vertexExporter = esVertexExporter; - // Remove calls - for (auto call : removedCalls) { + for (auto call : callsToRemove) { call->dropAllReferences(); call->eraseFromParent(); } @@ -3715,7 +3793,7 @@ void NggPrimShader::mutateGs() { IRBuilder<>::InsertPointGuard guard(m_builder); - SmallVector removedCalls; + SmallVector callsToRemove; m_builder.SetInsertPointPastAllocas(m_gsHandlers.main); @@ -3761,62 +3839,62 @@ void NggPrimShader::mutateGs() { auto threadIdInSubgroup = m_builder.CreateMul(waveId, m_builder.getInt32(waveSize)); threadIdInSubgroup = m_builder.CreateAdd(threadIdInSubgroup, threadIdInWave); - // Handle GS message and GS output export - for (auto &func : m_gsHandlers.main->getParent()->functions()) { - if (func.getName().starts_with(lgcName::NggWriteGsOutput)) { - // Export GS outputs to GS-VS ring - for (auto user : func.users()) { - CallInst *const call = cast(user); - m_builder.SetInsertPoint(call); - - 
assert(call->arg_size() == 4); - const unsigned location = cast(call->getOperand(0))->getZExtValue(); - const unsigned compIdx = cast(call->getOperand(1))->getZExtValue(); - const unsigned streamId = cast(call->getOperand(2))->getZExtValue(); - assert(streamId < MaxGsStreams); - Value *output = call->getOperand(3); - - auto emitVerts = m_builder.CreateLoad(m_builder.getInt32Ty(), emitVertsPtrs[streamId]); - auto totalEmitVerts = m_builder.CreateLoad(m_builder.getInt32Ty(), totalEmitVertsPtr); - writeGsOutput(output, location, compIdx, streamId, threadIdInSubgroup, emitVerts, totalEmitVerts); - - removedCalls.push_back(call); - } - } else if (func.isIntrinsic() && func.getIntrinsicID() == Intrinsic::amdgcn_s_sendmsg) { - // Handle GS message - for (auto user : func.users()) { - CallInst *const call = cast(user); - m_builder.SetInsertPoint(call); - - if (getShaderStage(call->getParent()->getParent()) != ShaderStage::Geometry) - continue; // Not belong to GS messages - - uint64_t message = cast(call->getArgOperand(0))->getZExtValue(); - if (message == GsEmitStream0 || message == GsEmitStream1 || message == GsEmitStream2 || - message == GsEmitStream3) { - // Handle GS_EMIT, MSG[9:8] = STREAM_ID - unsigned streamId = (message & GsEmitCutStreamIdMask) >> GsEmitCutStreamIdShift; - assert(streamId < MaxGsStreams); - processGsEmit(streamId, threadIdInSubgroup, emitVertsPtrs[streamId], outVertsPtrs[streamId], - totalEmitVertsPtr); - } else if (message == GsCutStream0 || message == GsCutStream1 || message == GsCutStream2 || - message == GsCutStream3) { - // Handle GS_CUT, MSG[9:8] = STREAM_ID - unsigned streamId = (message & GsEmitCutStreamIdMask) >> GsEmitCutStreamIdShift; - assert(streamId < MaxGsStreams); - processGsCut(streamId, outVertsPtrs[streamId]); - } else { - // Unexpected GS message - llvm_unreachable("Unexpected GS message!"); - } + // Handle dialect op NggWriteGsOutputOp and GS message + struct Payload { + NggPrimShader &self; + const ArrayRef emitVertsPtrs; + 
const ArrayRef outVertsPtrs; + Value *totalEmitVertsPtr; + Value *threadIdInSubgroup; + SmallVectorImpl &callsToRemove; + }; + Payload payload = {*this, emitVertsPtrs, outVertsPtrs, totalEmitVertsPtr, threadIdInSubgroup, callsToRemove}; - removedCalls.push_back(call); - } - } - } + static const auto visitor = + llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](Payload &payload, NggWriteGsOutputOp &writeGsOutputOp) { + auto &builder = payload.self.m_builder; + builder.SetInsertPoint(&writeGsOutputOp); + + const unsigned streamId = writeGsOutputOp.getStreamId(); + assert(streamId < MaxGsStreams); + + auto emitVerts = builder.CreateLoad(builder.getInt32Ty(), payload.emitVertsPtrs[streamId]); + auto totalEmitVerts = builder.CreateLoad(builder.getInt32Ty(), payload.totalEmitVertsPtr); + payload.self.writeGsOutput(writeGsOutputOp.getOutputValue(), writeGsOutputOp.getLocation(), + writeGsOutputOp.getComponent(), streamId, payload.threadIdInSubgroup, emitVerts, + totalEmitVerts); + + payload.callsToRemove.push_back(&writeGsOutputOp); + }) + .add([](Payload &payload, GsEmitStreamOp &gsEmitStreamOp) { + auto &builder = payload.self.m_builder; + builder.SetInsertPoint(&gsEmitStreamOp); - // Clear removed calls - for (auto call : removedCalls) { + const unsigned streamId = gsEmitStreamOp.getStreamId(); + assert(streamId < MaxGsStreams); + + payload.self.processGsEmit(streamId, payload.threadIdInSubgroup, payload.emitVertsPtrs[streamId], + payload.outVertsPtrs[streamId], payload.totalEmitVertsPtr); + + payload.callsToRemove.push_back(&gsEmitStreamOp); + }) + .add([](Payload &payload, GsCutStreamOp &gsCutStreamOp) { + auto &builder = payload.self.m_builder; + builder.SetInsertPoint(&gsCutStreamOp); + + const unsigned streamId = gsCutStreamOp.getStreamId(); + assert(streamId < MaxGsStreams); + + payload.self.processGsCut(streamId, payload.outVertsPtrs[streamId]); + + payload.callsToRemove.push_back(&gsCutStreamOp); + 
}) + .build(); + visitor.visit(payload, *m_gsHandlers.main); + + for (auto call : callsToRemove) { call->dropAllReferences(); call->eraseFromParent(); } @@ -3888,48 +3966,49 @@ void NggPrimShader::runCopyShader(ArrayRef args) { // ===================================================================================================================== // Mutates copy shader to handle the reading GS outputs from GS-VS ring. void NggPrimShader::mutateCopyShader() { - if (!m_pipelineState->exportAttributeByExportInstruction()) - exportVertexAttributeThroughMemory(m_gsHandlers.copyShader); - IRBuilder<>::InsertPointGuard guard(m_builder); + mutateToExportVertex(m_gsHandlers.copyShader); + // Relative vertex index is always the last argument auto vertexIndex = getFunctionArgument(m_gsHandlers.copyShader, m_gsHandlers.copyShader->arg_size() - 1); const unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; + assert(rasterStream != InvalidValue); - SmallVector removedCalls; - - for (auto &func : m_gsHandlers.copyShader->getParent()->functions()) { - if (func.getName().starts_with(lgcName::NggReadGsOutput)) { - // Import GS outputs from GS-VS ring - for (auto user : func.users()) { - CallInst *const call = cast(user); - - if (call->getFunction() != m_gsHandlers.copyShader) - continue; // Not belong to copy shader - - m_builder.SetInsertPoint(call); + SmallVector callsToRemove; - assert(call->arg_size() == 3); - const unsigned location = cast(call->getOperand(0))->getZExtValue(); - const unsigned component = cast(call->getOperand(1))->getZExtValue(); - const unsigned streamId = cast(call->getOperand(2))->getZExtValue(); - assert(streamId < MaxGsStreams); - - // Only lower the GS output import calls if they belong to the rasterization stream. 
- if (streamId == rasterStream) { - auto vertexOffset = calcVertexItemOffset(streamId, vertexIndex); - auto output = readGsOutput(call->getType(), location, component, streamId, vertexOffset); - call->replaceAllUsesWith(output); - } + struct Payload { + NggPrimShader &self; + Value *vertexIndex; + const unsigned rasterStream; + SmallVectorImpl &callsToRemove; + }; + Payload payload = {*this, vertexIndex, rasterStream, callsToRemove}; + + static const auto visitor = + llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](Payload &payload, NggReadGsOutputOp &readGsOutputOp) { + auto &builder = payload.self.m_builder; + builder.SetInsertPoint(&readGsOutputOp); + + const unsigned streamId = readGsOutputOp.getStreamId(); + assert(streamId < MaxGsStreams); + + // Only lower the dialect op if it belongs to the rasterization stream. + if (streamId == payload.rasterStream) { + auto vertexOffset = payload.self.calcVertexItemOffset(streamId, payload.vertexIndex); + auto outputValue = payload.self.readGsOutput(readGsOutputOp.getType(), readGsOutputOp.getLocation(), + readGsOutputOp.getComponent(), streamId, vertexOffset); + readGsOutputOp.replaceAllUsesWith(outputValue); + } - removedCalls.push_back(call); - } - } - } + payload.callsToRemove.push_back(&readGsOutputOp); + }) + .build(); + visitor.visit(payload, *m_gsHandlers.copyShader); - // Clear removed calls - for (auto call : removedCalls) { + for (auto call : callsToRemove) { call->dropAllReferences(); call->eraseFromParent(); } @@ -4026,6 +4105,7 @@ void NggPrimShader::appendUserData(SmallVectorImpl &args, Function *tar // @param totalEmitVerts : Counter of GS emitted vertices for all streams void NggPrimShader::writeGsOutput(Value *output, unsigned location, unsigned component, unsigned streamId, Value *primitiveIndex, Value *emitVerts, llvm::Value *totalEmitVerts) { + assert(streamId < MaxGsStreams); if (!m_pipelineState->enableSwXfb() && 
m_pipelineState->getRasterizerState().rasterStream != streamId) { // NOTE: If SW-emulated stream-out is not enabled, only import those outputs that belong to the rasterization // stream. @@ -4102,6 +4182,7 @@ void NggPrimShader::writeGsOutput(Value *output, unsigned location, unsigned com // @param vertexOffset : Start offset of vertex item in GS-VS ring (in dwords) Value *NggPrimShader::readGsOutput(Type *outputTy, unsigned location, unsigned component, unsigned streamId, Value *vertexOffset) { + assert(streamId < MaxGsStreams); if (!m_pipelineState->enableSwXfb() && m_pipelineState->getRasterizerState().rasterStream != streamId) { // NOTE: If SW-emulated stream-out is not enabled, only import those outputs that belong to the rasterization // stream. @@ -4144,7 +4225,7 @@ Value *NggPrimShader::readGsOutput(Type *outputTy, unsigned location, unsigned c } // ===================================================================================================================== -// Processes the message GS_EMIT. +// Process the dialect op NggGsEmit. // // @param streamId : ID of output vertex stream // @param primitiveIndex : Relative primitive index in subgroup @@ -4164,7 +4245,7 @@ void NggPrimShader::processGsEmit(unsigned streamId, Value *primitiveIndex, Valu } // ===================================================================================================================== -// Processes the message GS_CUT. +// Process the dialect op NggGsCut. // // @param streamId : ID of output vertex stream // @param [in/out] outVertsPtr : Pointer to the counter of GS output vertices of current primitive for this stream @@ -4179,7 +4260,7 @@ void NggPrimShader::processGsCut(unsigned streamId, Value *outVertsPtr) { } // ===================================================================================================================== -// Creates the function that processes GS_EMIT. +// Create the function that processes the dialect op NggGsEmit. 
Function *NggPrimShader::createGsEmitHandler() { assert(m_hasGs); @@ -4312,7 +4393,7 @@ Function *NggPrimShader::createGsEmitHandler() { } // ===================================================================================================================== -// Creates the function that processes GS_CUT. +// Create the function that processes the dialect op NggGsCut. Function *NggPrimShader::createGsCutHandler() { assert(m_hasGs); @@ -6116,156 +6197,225 @@ Value *NggPrimShader::ballot(Value *value) { } // ===================================================================================================================== -// Export vertex attribute through memory (ATM) by handing the calls. We mutate the argument list of the target function -// by adding three additional arguments (attribute ring buffer descriptor, attribute ring base offset, and relative -// vertex index in subgroup). Also, we expand all export calls by replacing it with real instructions that do vertex -// attribute exporting through memory. +// Append additional arguments to the argument list for attribute-through-memory (ATM) of the specified shader stage. +// Currently, three arguments are required to do attribute-through-memory: +// (1) Attribute ring buffer descriptor; +// (2) Attribute ring base offset; +// (3) Relative vertex index in NGG subgroup. 
// -// @param [in/out] target : Target function to process vertex attribute export -void NggPrimShader::exportVertexAttributeThroughMemory(Function *&target) { +// @param [in/out] args : The arguments that will be appended to +void NggPrimShader::appendAttributeThroughMemoryArguments(SmallVectorImpl &args) { assert(m_gfxIp.major >= 11); // For GFX11+ assert(!m_pipelineState->exportAttributeByExportInstruction()); // ATM is allowed if (!m_attribRingBufDesc && !m_attribRingBaseOffset) return; // No ATM, no attributes to export - IRBuilder<>::InsertPointGuard guard(m_builder); - - // - // Mutate the argument list by adding two additional arguments - // - auto newTarget = - addFunctionArgs(target, nullptr, - { - FixedVectorType::get(m_builder.getInt32Ty(), 4), // Attribute ring buffer descriptor (4 SGPRs) - m_builder.getInt32Ty(), // Attribute ring base offset (SGPR) - m_builder.getInt32Ty() // Relative vertex index in subgroup (VGPR) - }, - {"attribRingBufDesc", "attribRingBaseOffset", "vertexIndex"}, 0x3); - - // Original function is no longer needed - assert(target->use_empty()); - target->eraseFromParent(); + args.push_back(m_attribRingBufDesc); + args.push_back(m_attribRingBaseOffset); + args.push_back(m_nggInputs.threadIdInSubgroup); +} - target = newTarget; +// ===================================================================================================================== +// Mutate the target function to export vertex (positions and attributes) by lowering position/attribute exporting. If +// attribute through memory (ATM) is required, we mutate its argument list by adding three additional arguments +// (attribute ring buffer descriptor, attribute ring base offset, and relative vertex index in subgroup). 
+// +// @param [in/out] target : Target function to process vertex export +void NggPrimShader::mutateToExportVertex(Function *&target) { + Value *attribRingBufDesc = nullptr; + Value *attribRingBaseOffset = nullptr; + Value *vertexIndex = nullptr; // - // Expand vertex attribute export calls by replacing them with real instructions + // Mutate the argument list of the target function for ATM. // + if (!m_pipelineState->exportAttributeByExportInstruction()) { + assert(m_gfxIp.major >= 11); // Must be GFX11+ - // Always the first three arguments, added by us - auto attribRingBufDesc = target->getArg(0); - auto attribRingBaseOffset = target->getArg(1); - auto vertexIndex = target->getArg(2); - - m_builder.SetInsertPointPastAllocas(target); - - SmallVector removedCalls; - - for (auto &func : target->getParent()->functions()) { - if (func.getName().starts_with(lgcName::NggAttributeThroughMemory)) { - for (auto user : func.users()) { - CallInst *const call = dyn_cast(user); - assert(call); - - if (call->getParent()->getParent() != target) - continue; // Export call doesn't belong to targeted function, skip - - m_builder.SetInsertPoint(call); - - // Export vertex attributes - const unsigned location = cast(call->getArgOperand(0))->getZExtValue(); - auto locationOffset = m_builder.getInt32(location * SizeOfVec4); - - auto attribValue = call->getArgOperand(1); - assert(attribValue->getType() == FixedVectorType::get(m_builder.getFloatTy(), 4)); // Must be <4 xfloat> + // Could be no ATM + if (m_attribRingBufDesc && m_attribRingBaseOffset) { + // Mutate the argument list by adding two additional arguments + auto newTarget = addFunctionArgs( + target, nullptr, + { + FixedVectorType::get(m_builder.getInt32Ty(), 4), // Attribute ring buffer descriptor (4 SGPRs) + m_builder.getInt32Ty(), // Attribute ring base offset (SGPR) + m_builder.getInt32Ty() // Relative vertex index in subgroup (VGPR) + }, + {"attribRingBufDesc", "attribRingBaseOffset", "vertexIndex"}, 0x3); - CoherentFlag 
coherent = {}; - if (m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) { - coherent.bits.glc = true; - } - m_builder.CreateIntrinsic(m_builder.getVoidTy(), Intrinsic::amdgcn_struct_buffer_store, - {attribValue, attribRingBufDesc, vertexIndex, locationOffset, attribRingBaseOffset, - m_builder.getInt32(coherent.u32All)}); + // Original function is no longer needed + assert(target->use_empty()); + target->eraseFromParent(); - removedCalls.push_back(call); - } + target = newTarget; - break; // Vertex attribute export calls are handled, could exit the loop + attribRingBufDesc = target->getArg(0); + attribRingBaseOffset = target->getArg(1); + vertexIndex = target->getArg(2); } } - // NOTE: If the workaround of attributes-through-memory preceding vertex position data is required, we have to collect - // all vertex position export calls and move them before the return instruction. This actually places them after the - // writing operations of attributes-through-memory - if (m_pipelineState->getTargetInfo().getGpuWorkarounds().gfx11.waAtmPrecedesPos) { - SmallVector exportCalls; + // + // Collect vertex position/attribute exports. 
+ // + SmallVector exportPositionOps; + SmallVector exportAttributeOps; + + // Collect vertex position/attribute exports + struct Payload { + SmallVectorImpl &exportPositionOps; + SmallVectorImpl &exportAttributeOps; + }; + Payload payload = {exportPositionOps, exportAttributeOps}; + + static const auto visitor = + llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](Payload &payload, NggExportPositionOp &exportPositionOp) { + payload.exportPositionOps.push_back(&exportPositionOp); + }) + .add([](Payload &payload, NggExportAttributeOp &exportAttributeOp) { + payload.exportAttributeOps.push_back(&exportAttributeOp); + }) + .build(); + visitor.visit(payload, *target); + + // If there are no position/attribute exports, skip further processing + if (exportPositionOps.empty() && exportAttributeOps.empty()) + return; - // Colllect export calls of vertex position data - for (auto &func : target->getParent()->functions()) { - if (func.isIntrinsic() && func.getIntrinsicID() == Intrinsic::amdgcn_exp) { - for (auto user : func.users()) { - CallInst *const call = dyn_cast(user); - assert(call); + assert(!exportPositionOps.empty()); // Position0 export is always present + ReturnInst *retInst = dyn_cast(exportPositionOps[0]->getParent()->getTerminator()); + assert(retInst); - if (call->getParent()->getParent() != target) - continue; // Export call doesn't belong to targeted function, skip + // + // Reorder vertex position/attribute exports. + // + for (auto exportPositionOp : exportPositionOps) + exportPositionOp->moveBefore(retInst); - exportCalls.push_back(call); - } - } + // NOTE: If the workaround of attributes-through-memory preceding vertex position data is required, we have to + // place vertex exports after all attribute exports (ATM operations). 
+ Instruction *movePoint = retInst; + if (m_pipelineState->getTargetInfo().getGpuWorkarounds().gfx11.waAtmPrecedesPos) { + if (!exportAttributeOps.empty()) { + m_builder.SetInsertPoint(exportPositionOps[0]); + movePoint = + m_builder.CreateFence(AtomicOrdering::Release, m_builder.getContext().getOrInsertSyncScopeID("agent")); } + } - // Move the export calls before the return instructions - ReturnInst *retInst = nullptr; - for (unsigned i = 0; i < exportCalls.size(); ++i) { - auto exportCall = exportCalls[i]; + for (auto exportAttributeOp : exportAttributeOps) + exportAttributeOp->moveBefore(movePoint); - if (retInst) { - // All export calls are expected to be in the same basic block - assert(retInst == exportCall->getParent()->getTerminator()); - } else { - retInst = dyn_cast(exportCall->getParent()->getTerminator()); - assert(retInst); - } + // + // Lower vertex position/attribute exports. + // + for (auto exportPositionOp : exportPositionOps) { + m_builder.SetInsertPoint(exportPositionOp); + const bool lastExport = exportPositionOp == exportPositionOps[exportPositionOps.size() - 1]; + exportPosition(exportPositionOp->getExportSlot(), + {exportPositionOp->getExportValue0(), exportPositionOp->getExportValue1(), + exportPositionOp->getExportValue2(), exportPositionOp->getExportValue3()}, + lastExport); + } - exportCall->setOperand( - 6, m_builder.getInt1(i == exportCalls.size() - 1)); // Make export done flag for the last export call - exportCall->moveBefore(retInst); - } + for (auto exportAttributeOp : exportAttributeOps) { + m_builder.SetInsertPoint(exportAttributeOp); + exportAttribute(exportAttributeOp->getExportSlot(), + {exportAttributeOp->getExportValue0(), exportAttributeOp->getExportValue1(), + exportAttributeOp->getExportValue2(), exportAttributeOp->getExportValue3()}, + attribRingBufDesc, attribRingBaseOffset, vertexIndex); + } - // Before the first export call, add s_wait_vscnt 0 to make sure the completion of all attributes being written - // to the 
attribute ring buffer - assert(!exportCalls.empty()); // Position export is always present - m_builder.SetInsertPoint(exportCalls[0]); - m_builder.CreateFence(AtomicOrdering::Release, m_builder.getContext().getOrInsertSyncScopeID("agent")); + // + // Remove export dialect ops. + // + for (auto exportPositionOp : exportPositionOps) { + exportPositionOp->dropAllReferences(); + exportPositionOp->eraseFromParent(); } - // Remove calls - for (auto call : removedCalls) { - call->dropAllReferences(); - call->eraseFromParent(); + for (auto exportAttributeOp : exportAttributeOps) { + exportAttributeOp->dropAllReferences(); + exportAttributeOp->eraseFromParent(); } } // ===================================================================================================================== -// Append additional arguments to the argument list for attribute-through-memory (ATM) of the specified shader stage. -// Currently, three arguments are required to do attribute-through-memory: -// (1) Attribute ring buffer descriptor; -// (2) Attribute ring base offset; -// (3) Relative vertex index in NGG subgroup. +// Export vertex position. 
// -// @param [in/out] args : The arguments that will be appended to -void NggPrimShader::appendAttributeThroughMemoryArguments(SmallVectorImpl &args) { - assert(m_gfxIp.major >= 11); // For GFX11+ - assert(!m_pipelineState->exportAttributeByExportInstruction()); // ATM is allowed +// @param exportSlot : Export slot +// @param exportValues : Vertex position values to export +// @param lastExport : Whether this is the last export +void NggPrimShader::exportPosition(unsigned exportSlot, ArrayRef exportValues, bool lastExport) { + assert(exportValues.size() == 4); + + unsigned channelMask = 0; + for (unsigned i = 0; i < 4; ++i) { + assert(exportValues[i]); + if (!isa(exportValues[i]) && !isa(exportValues[i])) + channelMask |= (1u << i); // Update channel mask if the value is valid (not unspecified) + } + + m_builder.CreateIntrinsic(Intrinsic::amdgcn_exp, m_builder.getFloatTy(), + {m_builder.getInt32(EXP_TARGET_POS_0 + exportSlot), // tgt + m_builder.getInt32(channelMask), // en + exportValues[0], // src0 + exportValues[1], // src1 + exportValues[2], // src2 + exportValues[3], // src3 + m_builder.getInt1(lastExport), // done + m_builder.getFalse()}); // vm +} - if (!m_attribRingBufDesc && !m_attribRingBaseOffset) - return; // No ATM, no attributes to export +// ===================================================================================================================== +// Export vertex attribute. 
+// +// @param exportSlot : Export slot +// @param exportValues : Vertex attribute values to export +// @param attribRingBufDesc : Attribute ring buffer descriptor +// @param attribRingBaseOffset : Subgroup's attribute ring base offset (in bytes) +// @param vertexIndex : Vertex index in subgroup +void NggPrimShader::exportAttribute(unsigned exportSlot, ArrayRef exportValues, Value *attribRingBufDesc, + Value *attribRingBaseOffset, Value *vertexIndex) { + assert(exportValues.size() == 4); + + if (m_pipelineState->exportAttributeByExportInstruction()) { + unsigned channelMask = 0; + for (unsigned i = 0; i < 4; ++i) { + assert(exportValues[i]); + if (!isa(exportValues[i]) && !isa(exportValues[i])) + channelMask |= (1u << i); // Update channel mask if the value is valid (not unspecified) + } + + m_builder.CreateIntrinsic(Intrinsic::amdgcn_exp, m_builder.getFloatTy(), + {m_builder.getInt32(EXP_TARGET_PARAM_0 + exportSlot), // tgt + m_builder.getInt32(channelMask), // en + exportValues[0], // src0 + exportValues[1], // src1 + exportValues[2], // src2 + exportValues[3], // src3 + m_builder.getFalse(), // done + m_builder.getFalse()}); // vm + } else { + auto locationOffset = m_builder.getInt32(exportSlot * SizeOfVec4); - args.push_back(m_attribRingBufDesc); - args.push_back(m_attribRingBaseOffset); - args.push_back(m_nggInputs.threadIdInSubgroup); + Value *exportValue = PoisonValue::get(FixedVectorType::get(m_builder.getFloatTy(), 4)); // Must be <4 x float> + for (unsigned i = 0; i < 4; ++i) + exportValue = m_builder.CreateInsertElement(exportValue, exportValues[i], i); + + CoherentFlag coherent = {}; + if (m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) { + coherent.bits.glc = true; + } + + m_builder.CreateIntrinsic(m_builder.getVoidTy(), Intrinsic::amdgcn_struct_buffer_store, + {exportValue, attribRingBufDesc, vertexIndex, locationOffset, attribRingBaseOffset, + m_builder.getInt32(coherent.u32All)}); + } } // 
===================================================================================================================== @@ -6403,29 +6553,19 @@ void NggPrimShader::processSwXfb(ArrayRef args) { : m_builder.getFloatTy(), vertexIndices[i], xfbOutputExport.offsetInVertex); - if (xfbOutputExport.is16bit) { - // NOTE: For 16-bit transform feedbakc outputs, they are stored as 32-bit without tightly packed in LDS. - outputValue = m_builder.CreateBitCast( - outputValue, FixedVectorType::get(m_builder.getInt32Ty(), xfbOutputExport.numElements)); - outputValue = m_builder.CreateTrunc(outputValue, - FixedVectorType::get(m_builder.getInt16Ty(), xfbOutputExport.numElements)); - outputValue = m_builder.CreateBitCast(outputValue, - FixedVectorType::get(m_builder.getHalfTy(), xfbOutputExport.numElements)); - } - unsigned format = 0; switch (xfbOutputExport.numElements) { case 1: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_FLOAT : BUF_FORMAT_32_FLOAT; + format = BUF_FORMAT_32_FLOAT; break; case 2: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_FLOAT : BUF_FORMAT_32_32_FLOAT_GFX11; + format = BUF_FORMAT_32_32_FLOAT_GFX11; break; case 3: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_FLOAT : BUF_FORMAT_32_32_32_FLOAT_GFX11; + format = BUF_FORMAT_32_32_32_FLOAT_GFX11; break; case 4: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_16_16_FLOAT_GFX11 : BUF_FORMAT_32_32_32_32_FLOAT_GFX11; + format = BUF_FORMAT_32_32_32_32_FLOAT_GFX11; break; default: llvm_unreachable("Unexpected element number!"); @@ -6445,34 +6585,13 @@ void NggPrimShader::processSwXfb(ArrayRef args) { // xfbOutputOffset = vertexOffset + xfbOffset Value *xfbOutputOffset = m_builder.CreateAdd(vertexOffset, m_builder.getInt32(xfbOutputExport.xfbOffset)); - if (xfbOutputExport.is16bit && xfbOutputExport.numElements == 3) { - // NOTE: For 16vec3, HW doesn't have a corresponding buffer store instruction. We have to split it to 16vec2 - // and 16scalar. 
- m_builder.CreateIntrinsic(m_builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, - {m_builder.CreateShuffleVector(outputValue, ArrayRef{0, 1}), // vdata - m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc - xfbOutputOffset, // offset - streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset - m_builder.getInt32(BUF_FORMAT_16_16_FLOAT), // format - m_builder.getInt32(coherent.u32All)}); // auxiliary data - - m_builder.CreateIntrinsic(m_builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, - {m_builder.CreateExtractElement(outputValue, 2), // vdata - m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc - m_builder.CreateAdd(xfbOutputOffset, - m_builder.getInt32(2 * sizeof(uint16_t))), // offset - streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset - m_builder.getInt32(BUF_FORMAT_16_FLOAT), // format - m_builder.getInt32(coherent.u32All)}); // auxiliary data - } else { - m_builder.CreateIntrinsic(m_builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, - {outputValue, // vdata - m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc - xfbOutputOffset, // offset - streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset - m_builder.getInt32(format), // format - m_builder.getInt32(coherent.u32All)}); // auxiliary data - } + m_builder.CreateIntrinsic(m_builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, + {outputValue, // vdata + m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc + xfbOutputOffset, // offset + streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset + m_builder.getInt32(format), // format + m_builder.getInt32(coherent.u32All)}); // auxiliary data } if (i == possibleVertsPerPrim - 1) { @@ -6870,29 +6989,19 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { xfbOutputExport.locInfo.location, xfbOutputExport.locInfo.component, i, calcVertexItemOffset(i, vertexIndices[j])); - if (xfbOutputExport.is16bit) { - // NOTE: For 16-bit transform feedbakc outputs, they are stored as 32-bit without tightly packed in LDS. 
- outputValue = m_builder.CreateBitCast( - outputValue, FixedVectorType::get(m_builder.getInt32Ty(), xfbOutputExport.numElements)); - outputValue = m_builder.CreateTrunc( - outputValue, FixedVectorType::get(m_builder.getInt16Ty(), xfbOutputExport.numElements)); - outputValue = m_builder.CreateBitCast( - outputValue, FixedVectorType::get(m_builder.getHalfTy(), xfbOutputExport.numElements)); - } - unsigned format = 0; switch (xfbOutputExport.numElements) { case 1: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_FLOAT : BUF_FORMAT_32_FLOAT; + format = BUF_FORMAT_32_FLOAT; break; case 2: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_FLOAT : BUF_FORMAT_32_32_FLOAT_GFX11; + format = BUF_FORMAT_32_32_FLOAT_GFX11; break; case 3: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_FLOAT : BUF_FORMAT_32_32_32_FLOAT_GFX11; + format = BUF_FORMAT_32_32_32_FLOAT_GFX11; break; case 4: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_16_16_FLOAT_GFX11 : BUF_FORMAT_32_32_32_32_FLOAT_GFX11; + format = BUF_FORMAT_32_32_32_32_FLOAT_GFX11; break; default: llvm_unreachable("Unexpected element number!"); @@ -6913,34 +7022,13 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { // xfbOutputOffset = vertexOffset + xfbOffset Value *xfbOutputOffset = m_builder.CreateAdd(vertexOffset, m_builder.getInt32(xfbOutputExport.xfbOffset)); - if (xfbOutputExport.is16bit && xfbOutputExport.numElements == 3) { - // NOTE: For 16vec3, HW doesn't have a corresponding buffer store instruction. We have to split it to 16vec2 - // and 16scalar. 
- m_builder.CreateIntrinsic(m_builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, - {m_builder.CreateShuffleVector(outputValue, ArrayRef{0, 1}), // vdata - m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc - xfbOutputOffset, // offset - streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset - m_builder.getInt32(BUF_FORMAT_16_16_FLOAT), // format - m_builder.getInt32(coherent.u32All)}); // auxiliary data - - m_builder.CreateIntrinsic( - m_builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, - {m_builder.CreateExtractElement(outputValue, 2), // vdata - m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc - m_builder.CreateAdd(xfbOutputOffset, m_builder.getInt32(2 * sizeof(uint16_t))), // offset - streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset - m_builder.getInt32(BUF_FORMAT_16_FLOAT), // format - m_builder.getInt32(coherent.u32All)}); // auxiliary data - } else { - m_builder.CreateIntrinsic(m_builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, - {outputValue, // vdata - m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc - xfbOutputOffset, // offset - streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset - m_builder.getInt32(format), // format - m_builder.getInt32(coherent.u32All)}); // auxiliary data - } + m_builder.CreateIntrinsic(m_builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, + {outputValue, // vdata + m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc + xfbOutputOffset, // offset + streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset + m_builder.getInt32(format), // format + m_builder.getInt32(coherent.u32All)}); // auxiliary data } } @@ -7168,7 +7256,7 @@ void NggPrimShader::prepareSwXfb(ArrayRef primCountInSubgroup) { // @param args : Arguments of primitive shader entry-point // @param [out] xfbOutputExports : Export info of transform feedback outputs Value *NggPrimShader::fetchXfbOutput(Function *target, ArrayRef args, - SmallVector &xfbOutputExports) { + SmallVectorImpl &xfbOutputExports) { 
assert(m_pipelineState->enableSwXfb()); auto resUsage = m_pipelineState->getShaderResourceUsage( @@ -7182,23 +7270,10 @@ Value *NggPrimShader::fetchXfbOutput(Function *target, ArrayRef args // // Clone the target function or just mutate the target function to fetch transform feedback outputs // + auto savedInsertPos = m_builder.saveIP(); // We don't clone the target function if we are in passthrough mode without GS - bool dontClone = !m_hasGs && m_nggControl->passthroughMode; - - // Collect all export calls for further analysis - SmallVector expFuncs; - for (auto &func : target->getParent()->functions()) { - if (dontClone) { - if (func.getName().starts_with(lgcName::NggXfbExport)) - expFuncs.push_back(&func); - } else { - if ((func.isIntrinsic() && func.getIntrinsicID() == Intrinsic::amdgcn_exp) || - func.getName().starts_with(lgcName::NggAttributeThroughMemory) || - func.getName().starts_with(lgcName::NggXfbExport)) - expFuncs.push_back(&func); - } - } + const bool makeClone = m_hasGs || !m_nggControl->passthroughMode; // Clone or mutate the target function xfbOutputExports.resize(xfbOutputCount); @@ -7211,16 +7286,7 @@ Value *NggPrimShader::fetchXfbOutput(Function *target, ArrayRef args Type *xfbReturnTy = m_hasGs ? 
m_builder.getVoidTy() : xfbOutputsTy; Function *xfbFetcher = target; - if (dontClone) { - if (!m_pipelineState->exportAttributeByExportInstruction()) - exportVertexAttributeThroughMemory(target); - - xfbFetcher = addFunctionArgs(target, xfbReturnTy, {}, {}, 0); - - // Original target function is no longer needed - assert(target->use_empty()); - target->eraseFromParent(); - } else { + if (makeClone) { auto xfbFetcherTy = FunctionType::get(xfbReturnTy, target->getFunctionType()->params(), false); xfbFetcher = Function::Create(xfbFetcherTy, target->getLinkage(), "", target->getParent()); @@ -7233,6 +7299,15 @@ Value *NggPrimShader::fetchXfbOutput(Function *target, ArrayRef args SmallVector retInsts; CloneFunctionInto(xfbFetcher, target, valueMap, CloneFunctionChangeType::LocalChangesOnly, retInsts); xfbFetcher->setName(NggXfbFetcher); + } else { + mutateToExportVertex(target); + + xfbFetcher = addFunctionArgs(target, xfbReturnTy, {}, {}, 0); + + // Original target function is no longer needed + assert(target->use_empty()); + target->eraseFromParent(); + target = nullptr; } // Find the return block @@ -7248,126 +7323,132 @@ Value *NggPrimShader::fetchXfbOutput(Function *target, ArrayRef args } } assert(retBlock); - - auto savedInsertPos = m_builder.saveIP(); m_builder.SetInsertPoint(retBlock); - // Visit all export calls, removing those unnecessary and mutating the return type - SmallVector removedCalls; + // Visit XFB writes and vertex position/attribute exports by lowering or removing them, and mutating the return type + SmallVector callsToRemove; Value *xfbOutputs = PoisonValue::get(xfbOutputsTy); unsigned outputIndex = 0; unsigned offsetInVertex = 0; - for (auto func : expFuncs) { - for (auto user : func->users()) { - CallInst *const call = dyn_cast(user); - assert(call); + struct Payload { + NggPrimShader &self; + Value *&xfbOutputs; + unsigned &outputIndex; + unsigned &offsetInVertex; + SmallVectorImpl &xfbOutputExports; + SmallVectorImpl &callsToRemove; + }; + 
Payload payload = {*this, xfbOutputs, outputIndex, offsetInVertex, xfbOutputExports, callsToRemove}; + + static const auto visitor = + llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](Payload &payload, WriteXfbOutputOp &writeXfbOutputOp) { + auto &builder = payload.self.m_builder; + + auto xfbBuffer = writeXfbOutputOp.getXfbBuffer(); + auto xfbOffset = writeXfbOutputOp.getXfbOffset(); + auto outputValue = writeXfbOutputOp.getOutputValue(); + assert(outputValue->getType()->getScalarSizeInBits() == 32); + + const unsigned numElements = outputValue->getType()->isVectorTy() + ? cast(outputValue->getType())->getNumElements() + : 1; + assert(numElements <= 4); + + // Those values are just for GS + auto streamId = InvalidValue; + unsigned location = InvalidValue; + unsigned component = InvalidValue; + + if (payload.self.m_hasGs) { + // NOTE: For GS, the output value must be loaded by NggReadGsOutputOp. This is generated by copy + // shader. 
+ NggReadGsOutputOp *readGsOutputOp = dyn_cast(outputValue); + streamId = writeXfbOutputOp.getStreamId(); + assert(streamId == readGsOutputOp->getStreamId()); // Stream ID must match + location = readGsOutputOp->getLocation(); + component = readGsOutputOp->getComponent(); + } else { + // If the output value is floating point, cast it to integer type + if (outputValue->getType()->isFPOrFPVectorTy()) { + if (numElements == 1) { + outputValue = builder.CreateBitCast(outputValue, builder.getInt32Ty()); + } else { + outputValue = + builder.CreateBitCast(outputValue, FixedVectorType::get(builder.getInt32Ty(), numElements)); + } + } + + // Always pad the output value to <4 x i32> + if (numElements == 1) { + outputValue = + builder.CreateInsertElement(PoisonValue::get(FixedVectorType::get(builder.getInt32Ty(), 4)), + outputValue, static_cast(0)); + } else if (numElements < 4) { + outputValue = builder.CreateShuffleVector(outputValue, PoisonValue::get(outputValue->getType()), + ArrayRef({0U, 1U, 2U, 3U})); + } + } - if (!dontClone) { - // Remove transform feedback export calls from the target function. No need of doing this if we - // just mutate it without cloning. 
- if (call->getFunction() == target && func->getName().starts_with(lgcName::NggXfbExport)) { - removedCalls.push_back(call); - continue; - } - } + // For VS/TES, return the output value + if (!payload.self.m_hasGs) + payload.xfbOutputs = builder.CreateInsertValue(payload.xfbOutputs, outputValue, payload.outputIndex); - if (call->getFunction() != xfbFetcher) - continue; + // Collect export info + payload.xfbOutputExports[payload.outputIndex].xfbBuffer = xfbBuffer; + payload.xfbOutputExports[payload.outputIndex].xfbOffset = xfbOffset; + payload.xfbOutputExports[payload.outputIndex].numElements = numElements; - assert(call->getParent() == retBlock); // Must in return block - - if (func->getName().starts_with(lgcName::NggXfbExport)) { - // Lower transform feedback export calls - auto xfbBuffer = cast(call->getArgOperand(0))->getZExtValue(); - auto xfbOffset = cast(call->getArgOperand(1))->getZExtValue(); - auto outputValue = call->getArgOperand(3); - - const unsigned numElements = - outputValue->getType()->isVectorTy() ? cast(outputValue->getType())->getNumElements() : 1; - const bool is16bit = outputValue->getType()->getScalarSizeInBits() == 16; - - // Those values are just for GS - auto streamId = InvalidValue; - unsigned location = InvalidValue; - unsigned component = InvalidValue; - - if (m_hasGs) { - // NOTE: For GS, the output value must be loaded by GS read output call. This is generated by copy shader. 
- CallInst *readCall = dyn_cast(outputValue); - assert(readCall && readCall->getCalledFunction()->getName().starts_with(lgcName::NggReadGsOutput)); - streamId = cast(call->getArgOperand(2))->getZExtValue(); - assert(streamId == cast(readCall->getArgOperand(2))->getZExtValue()); // Stream ID must match - location = cast(readCall->getArgOperand(0))->getZExtValue(); - component = cast(readCall->getArgOperand(1))->getZExtValue(); - } else { - // If the output value is floating point, cast it to integer type - if (outputValue->getType()->isFPOrFPVectorTy()) { - if (numElements == 1) { - outputValue = - m_builder.CreateBitCast(outputValue, is16bit ? m_builder.getInt16Ty() : m_builder.getInt32Ty()); + if (payload.self.m_hasGs) { + // Update fields for GS to use + payload.xfbOutputExports[payload.outputIndex].locInfo.streamId = streamId; + payload.xfbOutputExports[payload.outputIndex].locInfo.location = location; + payload.xfbOutputExports[payload.outputIndex].locInfo.component = component; } else { - outputValue = m_builder.CreateBitCast( - outputValue, - FixedVectorType::get(is16bit ? 
m_builder.getInt16Ty() : m_builder.getInt32Ty(), numElements)); + // Update the field for ES to use + payload.xfbOutputExports[payload.outputIndex].offsetInVertex = payload.offsetInVertex; + payload.offsetInVertex += numElements; // Increment the offset } - } - - // If the output value is 16-bit, zero-extend it to 32-bit - if (is16bit) - outputValue = m_builder.CreateZExt(outputValue, FixedVectorType::get(m_builder.getInt32Ty(), numElements)); - - // Always pad the output value to <4 x i32> - if (numElements == 1) { - outputValue = - m_builder.CreateInsertElement(PoisonValue::get(FixedVectorType::get(m_builder.getInt32Ty(), 4)), - outputValue, static_cast(0)); - } else if (numElements < 4) { - outputValue = m_builder.CreateShuffleVector(outputValue, PoisonValue::get(outputValue->getType()), - ArrayRef({0U, 1U, 2U, 3U})); - } - } - - // For VS/TES, return the output value - if (!m_hasGs) - xfbOutputs = m_builder.CreateInsertValue(xfbOutputs, outputValue, outputIndex); - - // Collect export info - xfbOutputExports[outputIndex].xfbBuffer = xfbBuffer; - xfbOutputExports[outputIndex].xfbOffset = xfbOffset; - xfbOutputExports[outputIndex].numElements = numElements; - xfbOutputExports[outputIndex].is16bit = is16bit; - - if (m_hasGs) { - // Update fields for GS to use - xfbOutputExports[outputIndex].locInfo.streamId = streamId; - xfbOutputExports[outputIndex].locInfo.location = location; - xfbOutputExports[outputIndex].locInfo.component = component; - } else { - // Update the field for ES to use - xfbOutputExports[outputIndex].offsetInVertex = offsetInVertex; - - unsigned xfbOutputSize = numElements; - // Double the size if 64-bit output - if (outputValue->getType()->getScalarSizeInBits() == 64) - xfbOutputSize *= 2; - offsetInVertex += xfbOutputSize; // Increment the offset - } - ++outputIndex; - } + ++payload.outputIndex; - removedCalls.push_back(call); // Remove export - } - } + payload.callsToRemove.push_back(&writeXfbOutputOp); + }) + .add([](Payload &payload, 
NggExportPositionOp &exportPositionOp) { + payload.callsToRemove.push_back(&exportPositionOp); + }) + .add([](Payload &payload, NggExportAttributeOp &exportAttributeOp) { + payload.callsToRemove.push_back(&exportAttributeOp); + }) + .build(); + visitor.visit(payload, *xfbFetcher); assert(outputIndex == xfbOutputCount); // Visit all transform feedback export calls m_builder.CreateRet(xfbOutputs); - // Remove calls - for (auto call : removedCalls) { + // Remove XFB writes in original target function + if (makeClone) { + assert(target); + + struct Payload { + SmallVectorImpl &callsToRemove; + }; + Payload payload = {callsToRemove}; + + static const auto visitor = llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](Payload &payload, WriteXfbOutputOp &writeXfbOutputOp) { + payload.callsToRemove.push_back(&writeXfbOutputOp); + }) + .build(); + visitor.visit(payload, *target); + } + + for (auto call : callsToRemove) { call->dropAllReferences(); call->eraseFromParent(); } @@ -7417,7 +7498,7 @@ Value *NggPrimShader::fetchXfbOutput(Function *target, ArrayRef args SmallVector xfbFetcherArgs; // If we don't clone the target function, we are going to run it and handle vertex attribute through memory here. 
- if (dontClone) { + if (!makeClone) { if (!m_pipelineState->exportAttributeByExportInstruction()) { if (!m_hasGs) // For GS, ATM is done in copy shader appendAttributeThroughMemoryArguments(xfbFetcherArgs); @@ -7699,7 +7780,7 @@ Value *NggPrimShader::readXfbOutputFromLds(Type *readDataTy, Value *vertexIndex, assert(!m_hasGs); const unsigned esGsRingItemSize = - m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor.esGsRingItemSize; + m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig.esGsRingItemSize; auto vertexItemOffset = m_builder.CreateMul(vertexIndex, m_builder.getInt32(esGsRingItemSize)); if (m_nggControl->passthroughMode) { @@ -7726,7 +7807,7 @@ void NggPrimShader::writeXfbOutputToLds(Value *writeData, Value *vertexIndex, un assert(!m_hasGs); const unsigned esGsRingItemSize = - m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor.esGsRingItemSize; + m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig.esGsRingItemSize; auto vertexItemOffset = m_builder.CreateMul(vertexIndex, m_builder.getInt32(esGsRingItemSize)); if (m_nggControl->passthroughMode) { @@ -7759,6 +7840,7 @@ Value *NggPrimShader::fetchVertexPositionData(Value *vertexIndex) { assert(inOutUsage.builtInOutputLocMap.find(BuiltInPosition) != inOutUsage.builtInOutputLocMap.end()); const unsigned loc = inOutUsage.builtInOutputLocMap[BuiltInPosition]; const unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; + assert(rasterStream != InvalidValue); auto vertexOffset = calcVertexItemOffset(rasterStream, vertexIndex); return readGsOutput(FixedVectorType::get(m_builder.getFloatTy(), 4), loc, 0, rasterStream, vertexOffset); @@ -7774,7 +7856,7 @@ Value *NggPrimShader::fetchCullDistanceSignMask(Value *vertexIndex) { if (!m_hasGs) { // ES-only const unsigned esGsRingItemSize = - 
m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor.esGsRingItemSize; + m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig.esGsRingItemSize; auto vertexItemOffset = m_builder.CreateMul(vertexIndex, m_builder.getInt32(esGsRingItemSize)); return readVertexCullInfoFromLds(m_builder.getInt32Ty(), vertexItemOffset, m_vertCullInfoOffsets.cullDistanceSignMask); @@ -7785,6 +7867,7 @@ Value *NggPrimShader::fetchCullDistanceSignMask(Value *vertexIndex) { assert(inOutUsage.builtInOutputLocMap.find(BuiltInCullDistance) != inOutUsage.builtInOutputLocMap.end()); const unsigned loc = inOutUsage.builtInOutputLocMap[BuiltInCullDistance]; const unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; + assert(rasterStream != InvalidValue); auto vertexOffset = calcVertexItemOffset(rasterStream, vertexIndex); auto &builtInUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->builtInUsage.gs; @@ -7812,11 +7895,12 @@ Value *NggPrimShader::fetchCullDistanceSignMask(Value *vertexIndex) { // @param vertexIndex : Relative vertex index in NGG subgroup. 
Value *NggPrimShader::calcVertexItemOffset(unsigned streamId, Value *vertexIndex) { assert(m_hasGs); // GS must be present + assert(streamId < MaxGsStreams); auto &inOutUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage; // vertexOffset = gsVsRingStart + streamBases[stream] + vertexIndex * vertexItemSize (in dwords) - const unsigned vertexItemSize = inOutUsage.gs.calcFactor.gsVsVertexItemSize[streamId]; + const unsigned vertexItemSize = inOutUsage.gs.hwConfig.gsVsVertexItemSize[streamId]; auto vertexOffset = m_builder.CreateMul(vertexIndex, m_builder.getInt32(vertexItemSize)); vertexOffset = m_builder.CreateAdd(vertexOffset, m_builder.getInt32(m_gsStreamBases[streamId])); diff --git a/lgc/patch/NggPrimShader.h b/lgc/patch/NggPrimShader.h index 4f60a3e7a9..4150e51fc0 100644 --- a/lgc/patch/NggPrimShader.h +++ b/lgc/patch/NggPrimShader.h @@ -152,7 +152,6 @@ struct XfbOutputExport { unsigned xfbBuffer; // Transform feedback buffer unsigned xfbOffset; // Transform feedback offset unsigned numElements; // Number of output elements, valid range is [1,4] - bool is16bit; // Whether the output is 16-bit // For ES only unsigned offsetInVertex; // Offset of an output within all transform feedback outputs of a vertex // For GS only @@ -299,14 +298,17 @@ class NggPrimShader { llvm::Value *fetchCullDistanceSignMask(llvm::Value *vertexIndex); llvm::Value *calcVertexItemOffset(unsigned streamId, llvm::Value *vertexIndex); - void exportVertexAttributeThroughMemory(llvm::Function *&target); void appendAttributeThroughMemoryArguments(llvm::SmallVectorImpl &args); + void mutateToExportVertex(llvm::Function *&target); + void exportPosition(unsigned exportSlot, llvm::ArrayRef exportValues, bool lastExport); + void exportAttribute(unsigned exportSlot, llvm::ArrayRef exportValues, llvm::Value *attribRingBufDesc, + llvm::Value *attribRingBaseOffset, llvm::Value *vertexIndex); void processSwXfb(llvm::ArrayRef args); void processSwXfbWithGs(llvm::ArrayRef 
args); void prepareSwXfb(llvm::ArrayRef primCountInSubgroup); llvm::Value *fetchXfbOutput(llvm::Function *target, llvm::ArrayRef args, - llvm::SmallVector &xfbOutputExports); + llvm::SmallVectorImpl &xfbOutputExports); llvm::Value *readXfbOutputFromLds(llvm::Type *readDataTy, llvm::Value *vertexIndex, unsigned offsetInVertex); void writeXfbOutputToLds(llvm::Value *writeData, llvm::Value *vertexIndex, unsigned offsetInVertex); diff --git a/lgc/patch/PassRegistry.inc b/lgc/patch/PassRegistry.inc index d9cb51c5b9..8298a4ee03 100644 --- a/lgc/patch/PassRegistry.inc +++ b/lgc/patch/PassRegistry.inc @@ -56,11 +56,11 @@ LLPC_MODULE_PASS("lgc-pipeline-state-recorder", PipelineStateRecorder) LLPC_MODULE_PASS("lgc-builder-replayer", BuilderReplayer) LLPC_MODULE_PASS("lgc-continufy", Continufy) LLPC_MODULE_PASS("lgc-collect-resource-usage", CollectResourceUsage) -LLPC_MODULE_PASS("lgc-patch-initialize-workgroup-memory", PatchInitializeWorkgroupMemory) +LLPC_MODULE_PASS("lgc-initialize-workgroup-memory", InitializeWorkgroupMemory) LLPC_MODULE_PASS("lgc-lower-image-derivatives", LowerImageDerivatives) LLPC_MODULE_PASS("lgc-lower-in-out", LowerInOut) LLPC_FUNCTION_PASS("lgc-lower-invariant-loads", LowerInvariantLoads) -LLPC_MODULE_PASS("lgc-patch-setup-target-features", PatchSetupTargetFeatures) +LLPC_MODULE_PASS("lgc-set-up-target-features", SetUpTargetFeatures) LLPC_MODULE_PASS("lgc-generate-copy-shader", GenerateCopyShader) LLPC_MODULE_PASS("lgc-patch-prepare-pipeline-abi", PreparePipelineAbi) LLPC_FUNCTION_PASS("lgc-lower-read-first-lane", LowerReadFirstLane) @@ -71,15 +71,15 @@ LLPC_MODULE_PASS("lgc-mutate-entry-point", MutateEntryPoint) LLPC_MODULE_PASS("lgc-patch-check-shader-cache", CheckShaderCache) LLPC_LOOP_PASS("lgc-add-loop-metadata", AddLoopMetadata) LLPC_FUNCTION_PASS("lgc-structurize-buffers", StructurizeBuffers) -LLPC_FUNCTION_PASS("lgc-patch-buffer-op", PatchBufferOp) +LLPC_FUNCTION_PASS("lgc-lower-buffer-operations", LowerBufferOperations) 
LLPC_MODULE_PASS("lgc-apply-workarounds", ApplyWorkarounds) LLPC_FUNCTION_PASS("lgc-scalarizer-loads", ScalarizeLoads) LLPC_FUNCTION_PASS("lgc-lower-mul-dx9-zero", LowerMulDx9Zero) LLPC_MODULE_PASS("lgc-generate-null-frag-shader", GenerateNullFragmentShader) -LLPC_MODULE_PASS("lgc-patch-tcs-passthrough-shader", TcsPassthroughShader) +LLPC_MODULE_PASS("lgc-passthrough-hull-shader", PassthroughHullShader) LLPC_MODULE_PASS("lgc-collect-image-operations", CollectImageOperations) LLPC_MODULE_PASS("lgc-vertex-fetch", LowerVertexFetch) -LLPC_MODULE_PASS("lgc-frag-color-export", LowerFragColorExport) +LLPC_MODULE_PASS("lgc-frag-color-export", LowerFragmentColorExport) LLPC_MODULE_PASS("lgc-lower-debug-printf", LowerDebugPrintf) LLPC_MODULE_PASS("lgc-lower-desc", LowerDesc) diff --git a/lgc/patch/PassthroughHullShader.cpp b/lgc/patch/PassthroughHullShader.cpp index 6556dc1eab..9529503d54 100644 --- a/lgc/patch/PassthroughHullShader.cpp +++ b/lgc/patch/PassthroughHullShader.cpp @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file PassthroughHullShader.cpp - * @brief LLPC source file: contains declaration and implementation of class lgc::TcsPassthroughShader. + * @brief LLPC source file: contains declaration and implementation of class lgc::PassthroughHullShader. 
*********************************************************************************************************************** */ #include "lgc/patch/PassthroughHullShader.h" @@ -45,7 +45,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "lgc-patch-tcs-passthrough-shader" +#define DEBUG_TYPE "lgc-passthrough-hull-shader" using namespace lgc; using namespace llvm; @@ -56,13 +56,13 @@ using namespace llvm; // @param module : LLVM module to be run on // @param analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses TcsPassthroughShader::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass TCS pass-through shader\n"); +PreservedAnalyses PassthroughHullShader::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass pass-through hull shader\n"); PipelineShadersResult &pipelineShaders = analysisManager.getResult(module); PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); - generateTcsPassthroughShader(module, pipelineShaders, pipelineState); + generatePassthroughHullShader(module, pipelineShaders, pipelineState); updatePipelineState(module, pipelineState); return PreservedAnalyses::none(); @@ -73,7 +73,7 @@ PreservedAnalyses TcsPassthroughShader::run(Module &module, ModuleAnalysisManage // // @param module : LLVM module to be run on // @param pipelineState : The pipeline state read from module. 
-void TcsPassthroughShader::updatePipelineState(Module &module, PipelineState *pipelineState) const { +void PassthroughHullShader::updatePipelineState(Module &module, PipelineState *pipelineState) const { pipelineState->setShaderStageMask(pipelineState->getShaderStageMask() | ShaderStageMask(ShaderStage::TessControl)); TessellationMode tessellationMode = pipelineState->getShaderModes()->getTessellationMode(); @@ -94,10 +94,10 @@ void TcsPassthroughShader::updatePipelineState(Module &module, PipelineState *pi // @param pipelineShaders : Pipeline shaders analysis result // @param pipelineState : The pipeline state read from module. // @returns : the entry point for the TCS pass-through shader. -Function *TcsPassthroughShader::generateTcsPassthroughShader(Module &module, PipelineShadersResult &pipelineShaders, - PipelineState *pipelineState) { +Function *PassthroughHullShader::generatePassthroughHullShader(Module &module, PipelineShadersResult &pipelineShaders, + PipelineState *pipelineState) { Function *entryPoint = generateTcsPassthroughEntryPoint(module, pipelineState); - generateTcsPassthroughShaderBody(module, pipelineShaders, pipelineState, entryPoint); + generatePassthroughHullShaderBody(module, pipelineShaders, pipelineState, entryPoint); return entryPoint; } @@ -107,7 +107,7 @@ Function *TcsPassthroughShader::generateTcsPassthroughShader(Module &module, Pip // @param module : The LLVM module in which to add the shader. // @param pipelineState : The pipeline state read from module. // @returns : The new entry point. 
-Function *TcsPassthroughShader::generateTcsPassthroughEntryPoint(Module &module, PipelineState *pipelineState) { +Function *PassthroughHullShader::generateTcsPassthroughEntryPoint(Module &module, PipelineState *pipelineState) { FunctionType *entryPointTy = FunctionType::get(Type::getVoidTy(module.getContext()), ArrayRef(), false); Function *entryPoint = Function::Create(entryPointTy, GlobalValue::ExternalLinkage, lgcName::TcsPassthroughEntryPoint, &module); @@ -124,8 +124,8 @@ Function *TcsPassthroughShader::generateTcsPassthroughEntryPoint(Module &module, // @param pipelineShaders : Pipeline shaders analysis result // @param pipelineState : The pipeline state read from module. // @param entryPointName : the entry point for the TCS pass-through shader. -void TcsPassthroughShader::generateTcsPassthroughShaderBody(Module &module, PipelineShadersResult &pipelineShaders, - PipelineState *pipelineState, Function *entryPoint) { +void PassthroughHullShader::generatePassthroughHullShaderBody(Module &module, PipelineShadersResult &pipelineShaders, + PipelineState *pipelineState, Function *entryPoint) { BasicBlock *block = BasicBlock::Create(entryPoint->getContext(), "", entryPoint); BuilderBase builder(module.getContext()); diff --git a/lgc/patch/PeepholeOptimization.cpp b/lgc/patch/PeepholeOptimization.cpp index 8e5989a97b..7d0eca2bb0 100644 --- a/lgc/patch/PeepholeOptimization.cpp +++ b/lgc/patch/PeepholeOptimization.cpp @@ -31,7 +31,10 @@ #include "lgc/patch/PeepholeOptimization.h" #include "lgc/Builder.h" #include "lgc/patch/LgcLowering.h" +#include "lgc/state/PipelineState.h" +#include "lgc/util/Internal.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" @@ -44,7 +47,25 @@ using namespace lgc; using namespace llvm; using namespace llvm::PatternMatch; -namespace lgc { +namespace { + +class PeepholeOptimizer : public llvm::InstVisitor { + +public: + 
PeepholeOptimizer(const ShaderOptions *shaderOptions) : m_changed(false), m_shaderOptions(shaderOptions) {} + + bool run(Function &function); + + void visitIntToPtr(IntToPtrInst &intToPtr); + void visitCallInst(CallInst &callInst); + +private: + bool m_changed; + const ShaderOptions *m_shaderOptions; + llvm::SmallVector m_instsToErase; +}; + +} // anonymous namespace // ===================================================================================================================== // Executes this LLVM pass on the specified LLVM function. @@ -55,8 +76,25 @@ namespace lgc { PreservedAnalyses PeepholeOptimization::run(Function &function, FunctionAnalysisManager &analysisManager) { LLVM_DEBUG(dbgs() << "Run the pass Peephole optimization\n"); - m_changed = false; + const auto &moduleAnalysisManager = analysisManager.getResult(function); + PipelineState *pipelineState = + moduleAnalysisManager.getCachedResult(*function.getParent())->getPipelineState(); + auto shaderStage = getShaderStage(&function); + const ShaderOptions *shaderOptions = nullptr; + if (shaderStage) + shaderOptions = &pipelineState->getShaderOptions(shaderStage.value()); + + PeepholeOptimizer pho(shaderOptions); + bool changed = pho.run(function); + return changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} +// ===================================================================================================================== +// Apply peephole optimizations to the function +// +// @param [in/out] function : Function that we will peephole optimize. +// @returns : true if any change was made +bool PeepholeOptimizer::run(Function &function) { visit(function); const bool changed = m_changed || !m_instsToErase.empty(); @@ -67,7 +105,7 @@ PreservedAnalyses PeepholeOptimization::run(Function &function, FunctionAnalysis } m_instsToErase.clear(); - return changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); + return changed; } // ===================================================================================================================== @@ -85,7 +123,7 @@ PreservedAnalyses PeepholeOptimization::run(Function &function, FunctionAnalysis // Reference: https://groups.google.com/g/llvm-dev/c/x4K7ppGLbg8/m/f_3NySRhjlcJ // @param intToPtr: The "inttoptr" instruction to visit. -void PeepholeOptimization::visitIntToPtr(IntToPtrInst &intToPtr) { +void PeepholeOptimizer::visitIntToPtr(IntToPtrInst &intToPtr) { // Check if we are using add to do pointer arithmetic. auto *const binaryOperator = dyn_cast(intToPtr.getOperand(0)); if (!binaryOperator || binaryOperator->getOpcode() != Instruction::Add) @@ -117,6 +155,8 @@ void PeepholeOptimization::visitIntToPtr(IntToPtrInst &intToPtr) { // Create a getelementptr instruction (using offset / size). const DataLayout &dataLayout = intToPtr.getModule()->getDataLayout(); const uint64_t size = dataLayout.getTypeAllocSize(elementType); + if (size == 0) + return; APInt index = constOffset->getValue().udiv(size); if (constOffset->getValue().urem(size) != 0) return; @@ -140,14 +180,26 @@ void PeepholeOptimization::visitIntToPtr(IntToPtrInst &intToPtr) { // ===================================================================================================================== // Visit a call instruction. // -// Peephole log2(const +/- x) -> log2(max(0.0, const +/- x)). -// This addresses a potential precision underflow in applications intolerant to in-spec math reordering. +// Peephole relevant argument to call such that const +/- x -> max(0.0, const +/- x) +// where the argument is X for log2(X) or pow(X, Y). +// This addresses a potential precision underflow in applications intolerant to +// in-spec math reordering. +// This has to be enabled per app or shader based on forceUnderflowPrevention option. // // @param callInst: The call instruction to visit. 
-void PeepholeOptimization::visitCallInst(CallInst &callInst) { - if (callInst.getIntrinsicID() != Intrinsic::log2) +void PeepholeOptimizer::visitCallInst(CallInst &callInst) { + // Only apply this peephole when explicitly requested via option + if (!(m_shaderOptions && m_shaderOptions->forceUnderflowPrevention)) return; + switch (callInst.getIntrinsicID()) { + case Intrinsic::log2: + case Intrinsic::pow: + break; + default: + return; + } + Value *V = callInst.getOperand(0); if (!(match(V, m_FSub(m_Constant(), m_Value())) || match(V, m_FSub(m_Value(), m_Constant())) || @@ -167,5 +219,3 @@ void PeepholeOptimization::visitCallInst(CallInst &callInst) { m_changed = true; } - -} // namespace lgc diff --git a/lgc/patch/PreparePipelineAbi.cpp b/lgc/patch/PreparePipelineAbi.cpp index 1642e68323..7b35da7269 100644 --- a/lgc/patch/PreparePipelineAbi.cpp +++ b/lgc/patch/PreparePipelineAbi.cpp @@ -88,7 +88,7 @@ PreservedAnalyses PreparePipelineAbi::run(Module &module, ModuleAnalysisManager m_gfxIp = m_pipelineState->getTargetInfo().getGfxIpVersion(); if (auto hsEntryPoint = m_pipelineShaders->getEntryPoint(ShaderStage::TessControl)) - storeTessFactors(hsEntryPoint); + storeTessFactorsAndHsOutputs(hsEntryPoint); mergeShader(module); @@ -112,6 +112,8 @@ std::pair PreparePipelineAbi::readTessFactors(PipelineState *p auto func = builder.GetInsertBlock()->getParent(); auto lds = Patch::getLdsVariable(pipelineState, func); + const auto &hwConfig = pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.hwConfig; + // Helper to read value from LDS auto readValueFromLds = [&](Type *readTy, Value *ldsOffset) { assert(readTy->getScalarSizeInBits() == 32); // Only accept 32-bit data @@ -144,8 +146,9 @@ std::pair PreparePipelineAbi::readTessFactors(PipelineState *p } assert(numOuterTfs >= 2 && numOuterTfs <= 4); - // ldsOffset = relativeId * MaxTessFactorsPerPatch - Value *ldsOffset = builder.CreateMul(relPatchId, builder.getInt32(MaxTessFactorsPerPatch)); + // 
ldsOffset = tessFactorStart + relPatchId * tessFactorStride + Value *ldsOffset = builder.CreateMul(relPatchId, builder.getInt32(hwConfig.onChip.tessFactorStride)); + ldsOffset = builder.CreateAdd(ldsOffset, builder.getInt32(hwConfig.onChip.tessFactorStart)); Value *outerTf = readValueFromLds(FixedVectorType::get(builder.getFloatTy(), numOuterTfs), ldsOffset); // NOTE: For isoline, the outer tessellation factors have to be exchanged, which is required by HW. @@ -157,9 +160,10 @@ std::pair PreparePipelineAbi::readTessFactors(PipelineState *p assert(numInnerTfs <= 2); Value *innerTf = nullptr; if (numInnerTfs > 0) { - // ldsOffset = relativeId * MaxTessFactorsPerPatch + 4 - Value *ldsOffset = builder.CreateMul(relPatchId, builder.getInt32(MaxTessFactorsPerPatch)); - ldsOffset = builder.CreateAdd(ldsOffset, builder.getInt32(4)); + // ldsOffset = tessFactorStart + relPatchId * tessFactorStride + numOuterTfs + Value *ldsOffset = builder.CreateMul(relPatchId, builder.getInt32(hwConfig.onChip.tessFactorStride)); + ldsOffset = builder.CreateAdd(ldsOffset, builder.getInt32(hwConfig.onChip.tessFactorStart)); + ldsOffset = builder.CreateAdd(ldsOffset, builder.getInt32(numOuterTfs)); innerTf = readValueFromLds(FixedVectorType::get(builder.getFloatTy(), numInnerTfs), ldsOffset); } @@ -177,7 +181,7 @@ std::pair PreparePipelineAbi::readTessFactors(PipelineState *p // @param innerTf : Inner tessellation factors to write to TF buffer // @param builder : IR builder to insert instructions void PreparePipelineAbi::writeTessFactors(PipelineState *pipelineState, Value *tfBufferDesc, Value *tfBufferBase, - Value *relPatchId, Value *outerTf, Value *innerTf, IRBuilder<> &builder) { + Value *relPatchId, Value *outerTf, Value *innerTf, BuilderBase &builder) { // NOTE: Tessellation factors are from tessellation level array and we have: // Isoline: // TF[0] = outerTF[0] @@ -194,40 +198,25 @@ void PreparePipelineAbi::writeTessFactors(PipelineState *pipelineState, Value *t // TF[3] = outerTF[3] 
// TF[4] = innerTF[0] // TF[5] = innerTF[1] - const auto &calcFactor = pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.calcFactor; - Value *tfBufferOffset = builder.CreateMul(relPatchId, builder.getInt32(calcFactor.tessFactorStride * sizeof(float))); - - CoherentFlag coherent = {}; - if (pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) { - coherent.bits.glc = true; - } - const auto numOuterTfs = cast(outerTf->getType())->getNumElements(); const auto numInnerTfs = innerTf ? cast(innerTf->getType())->getNumElements() : 0; // Isoline doesn't have inner tessellation factors - (void(numOuterTfs)); // Unused - (void(numInnerTfs)); - - auto bufferFormatX2 = BUF_NUM_FORMAT_FLOAT << 4 | BUF_DATA_FORMAT_32_32; - auto bufferFormatX4 = BUF_NUM_FORMAT_FLOAT << 4 | BUF_DATA_FORMAT_32_32_32_32; - if (pipelineState->getTargetInfo().getGfxIpVersion().major == 10) { - bufferFormatX2 = BUF_FORMAT_32_32_FLOAT_GFX10; - bufferFormatX4 = BUF_FORMAT_32_32_32_32_FLOAT_GFX10; - } else if (pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) { - bufferFormatX2 = BUF_FORMAT_32_32_FLOAT_GFX11; - bufferFormatX4 = BUF_FORMAT_32_32_32_32_FLOAT_GFX11; + Value *tfBufferOffset = builder.CreateMul(relPatchId, builder.getInt32((numOuterTfs + numInnerTfs) * sizeof(float))); + + CoherentFlag coherent = {}; + if (pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) { + coherent.bits.glc = true; } auto primitiveMode = pipelineState->getShaderModes()->getTessellationMode().primitiveMode; if (primitiveMode == PrimitiveMode::Isolines) { assert(numOuterTfs == 2 && numInnerTfs == 0); - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, + builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_buffer_store, {outerTf, // vdata tfBufferDesc, // rsrc tfBufferOffset, // voffset tfBufferBase, // soffset - builder.getInt32(bufferFormatX2), // format builder.getInt32(coherent.u32All)}); // glc } else if 
(primitiveMode == PrimitiveMode::Triangles) { @@ -238,36 +227,251 @@ void PreparePipelineAbi::writeTessFactors(PipelineState *pipelineState, Value *t tessFactor = builder.CreateInsertElement(tessFactor, builder.CreateExtractElement(innerTf, static_cast(0)), 3); - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, + builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_buffer_store, {tessFactor, // vdata tfBufferDesc, // rsrc tfBufferOffset, // voffset tfBufferBase, // soffset - builder.getInt32(bufferFormatX4), // format builder.getInt32(coherent.u32All)}); // glc } else { assert(primitiveMode == PrimitiveMode::Quads); assert(numOuterTfs == 4 && numInnerTfs == 2); - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, + builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_buffer_store, {outerTf, // vdata tfBufferDesc, // rsrc tfBufferOffset, // voffset tfBufferBase, // soffset - builder.getInt32(bufferFormatX4), // format builder.getInt32(coherent.u32All)}); // glc tfBufferOffset = builder.CreateAdd(tfBufferOffset, builder.getInt32(4 * sizeof(float))); - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, + builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_buffer_store, {innerTf, // vdata tfBufferDesc, // rsrc tfBufferOffset, // voffset tfBufferBase, // soffset - builder.getInt32(bufferFormatX2), // format builder.getInt32(coherent.u32All)}); // glc } } +// ===================================================================================================================== +// Write HS outputs to off-chip LDS buffer. 
+// +// @param pipelineState : Pipeline state +// @param offChipLdsDesc : Off-chip LDS buffer descriptor +// @param offChipLdsBase : Off-chip LDS buffer base offset +// @param relPatchId : Relative patch ID (output patch ID in group) +// @param vertexIdx : Vertex indexing (output control point ID) +// @param outerTf : Outer tessellation factors to check (if any one is less than or equal to zero, discard the patch) +// @param builder : IR builder to insert instructions +void PreparePipelineAbi::writeHsOutputs(PipelineState *pipelineState, Value *offChipLdsDesc, Value *offChipLdsBase, + Value *relPatchId, Value *vertexIdx, Value *outerTf, BuilderBase &builder) { + IRBuilder<>::InsertPointGuard guard(builder); + + auto func = builder.GetInsertBlock()->getParent(); + auto lds = Patch::getLdsVariable(pipelineState, func); + + // Helper to read value from LDS + auto readValueFromLds = [&](Type *readTy, Value *ldsOffset) { + assert(readTy->getScalarSizeInBits() == 32); // Only accept 32-bit data + + Value *readPtr = builder.CreateGEP(builder.getInt32Ty(), lds, ldsOffset); + readPtr = builder.CreateBitCast(readPtr, PointerType::get(readTy, readPtr->getType()->getPointerAddressSpace())); + return builder.CreateAlignedLoad(readTy, readPtr, Align(4)); + }; + + // + // Check if this patch could be discarded + // + Value *minOuterTf = builder.CreateExtractElement(outerTf, static_cast(0)); + for (unsigned i = 1; i < cast(outerTf->getType())->getNumElements(); ++i) + minOuterTf = builder.CreateBinaryIntrinsic(Intrinsic::minnum, minOuterTf, builder.CreateExtractElement(outerTf, i)); + + auto validPatch = builder.CreateFCmpOGT(minOuterTf, ConstantFP::get(builder.getFloatTy(), 0.0)); // minOuterTf > 0.0 + builder.CreateIf(validPatch, false, ".writeHsOutputs"); + + // + // Write HS outputs to off-chip LDS buffer if this patch is valid + // + auto &inOutUsage = pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage; + const auto &builtInUsage = 
pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->builtInUsage.tcs; + const auto &hwConfig = inOutUsage.tcs.hwConfig; + + // Check if we don't need to write this built-in to off-chip LDS buffer because it is only accessed by HS + auto checkBuiltInNotToWrite = [&](unsigned builtIn) { + if (pipelineState->getNextShaderStage(ShaderStage::TessControl) == ShaderStage::TessEval) { + auto nextInOutStage = pipelineState->getShaderResourceUsage(ShaderStage::TessEval)->inOutUsage; + if (builtIn == BuiltInTessLevelOuter || builtIn == BuiltInTessLevelInner) { + if (inOutUsage.perPatchBuiltInOutputLocMap.count(builtIn) > 0 && + nextInOutStage.perPatchBuiltInInputLocMap.count(builtIn) == 0) + return true; + } else { + if (inOutUsage.builtInOutputLocMap.count(builtIn) > 0 && nextInOutStage.builtInInputLocMap.count(builtIn) == 0) + return true; + } + } + return false; + }; + + static const unsigned BufferFormatsGfx10[] = {BUF_FORMAT_32_FLOAT, BUF_FORMAT_32_32_FLOAT_GFX10, + BUF_FORMAT_32_32_32_FLOAT_GFX10, BUF_FORMAT_32_32_32_32_FLOAT_GFX10}; + static const unsigned BufferFormatsGfx11[] = {BUF_FORMAT_32_FLOAT, BUF_FORMAT_32_32_FLOAT_GFX11, + BUF_FORMAT_32_32_32_FLOAT_GFX11, BUF_FORMAT_32_32_32_32_FLOAT_GFX11}; + + const auto gfxIp = pipelineState->getTargetInfo().getGfxIpVersion(); + ArrayRef bufferFormats(gfxIp.major == 10 ? 
BufferFormatsGfx10 : BufferFormatsGfx11); + CoherentFlag coherent = {}; + if (gfxIp.major <= 11) { + coherent.bits.glc = true; + } + + // Write per-vertex HS outputs to off-chip LDS buffer + if (inOutUsage.outputMapLocCount > 0) { + SmallDenseSet builtInLocsNotToWrite; + SmallDenseMap builtInLocsToTypes; + + for (const auto &[builtIn, loc] : inOutUsage.builtInOutputLocMap) { + if (checkBuiltInNotToWrite(builtIn)) { + assert(inOutUsage.builtInOutputLocMap.count(builtIn) > 0); + builtInLocsNotToWrite.insert(inOutUsage.builtInOutputLocMap[builtIn]); + } else { + switch (builtIn) { + case BuiltInPosition: + builtInLocsToTypes[loc] = FixedVectorType::get(builder.getFloatTy(), 4); + break; + case BuiltInPointSize: + builtInLocsToTypes[loc] = builder.getFloatTy(); + break; + case BuiltInClipDistance: + case BuiltInCullDistance: { + const unsigned clipOrCullDistance = + builtIn == BuiltInClipDistance ? builtInUsage.clipDistance : builtInUsage.cullDistance; + assert(clipOrCullDistance > 0 && clipOrCullDistance <= 8); + + builtInLocsToTypes[loc] = clipOrCullDistance == 1 + ? builder.getFloatTy() + : FixedVectorType::get(builder.getFloatTy(), std::min(clipOrCullDistance, 4U)); + if (clipOrCullDistance > 4) { + builtInLocsToTypes[loc + 1] = clipOrCullDistance == 5 + ? 
builder.getFloatTy() + : FixedVectorType::get(builder.getFloatTy(), clipOrCullDistance - 4); + } + + break; + } + case BuiltInViewportIndex: + case BuiltInLayer: + builtInLocsToTypes[loc] = builder.getInt32Ty(); + break; + default: + llvm_unreachable("Unexpected built-in"); + break; + } + } + } + + // baseOffset = outputPatchStart + (relPatchId * outputVertexCount + vertexIdx) * outputVertexStride + + // = outputPatchStart + relPatchId * outputPatchSize + vertexIdx * outputVertexStride + auto onChipLdsBaseOffset = builder.CreateMul(relPatchId, builder.getInt32(hwConfig.onChip.outputPatchSize)); + onChipLdsBaseOffset = builder.CreateAdd( + onChipLdsBaseOffset, builder.CreateMul(vertexIdx, builder.getInt32(hwConfig.onChip.outputVertexStride))); + onChipLdsBaseOffset = builder.CreateAdd(onChipLdsBaseOffset, builder.getInt32(hwConfig.onChip.outputPatchStart)); + + auto offChipLdsBaseOffset = builder.CreateMul(relPatchId, builder.getInt32(hwConfig.offChip.outputPatchSize)); + offChipLdsBaseOffset = builder.CreateAdd( + offChipLdsBaseOffset, builder.CreateMul(vertexIdx, builder.getInt32(hwConfig.offChip.outputVertexStride))); + offChipLdsBaseOffset = builder.CreateAdd(offChipLdsBaseOffset, builder.getInt32(hwConfig.offChip.outputPatchStart)); + + for (unsigned loc = 0; loc < inOutUsage.outputMapLocCount; ++loc) { + if (builtInLocsNotToWrite.count(loc) > 0) + continue; + + Type *outputTy = FixedVectorType::get(builder.getInt32Ty(), 4); // <4 x i32> for generic outputs + if (builtInLocsToTypes.count(loc) > 0) + outputTy = builtInLocsToTypes[loc]; // Built-in outputs have known types + + const unsigned numComponents = outputTy->isVectorTy() ? 
cast(outputTy)->getNumElements() : 1; + + // ldsOffset = baseOffset + attribOffset + auto attribOffset = builder.getInt32(4 * loc); + auto onChipLdsOffset = builder.CreateAdd(onChipLdsBaseOffset, attribOffset); + auto output = readValueFromLds(outputTy, onChipLdsOffset); + + auto offChipLdsOffset = builder.CreateAdd(offChipLdsBaseOffset, attribOffset); + offChipLdsOffset = builder.CreateMul(offChipLdsOffset, builder.getInt32(4)); // Convert to byte offset + + builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, + {output, // vdata + offChipLdsDesc, // rsrc + offChipLdsOffset, // voffset + offChipLdsBase, // soffset + builder.getInt32(bufferFormats[numComponents - 1]), // format + builder.getInt32(coherent.u32All)}); // glc + } + } + + // Write per-patch HS outputs to off-chip LDS buffer + if (inOutUsage.perPatchOutputMapLocCount > 0) { + SmallDenseSet builtInLocsNotToWrite; + SmallDenseMap builtInLocsToTypes; + + for (const auto &[builtIn, loc] : inOutUsage.perPatchBuiltInOutputLocMap) { + if (checkBuiltInNotToWrite(builtIn)) { + assert(inOutUsage.perPatchBuiltInOutputLocMap.count(builtIn) > 0); + builtInLocsNotToWrite.insert(inOutUsage.perPatchBuiltInOutputLocMap[builtIn]); + } else { + Type *type = nullptr; + switch (builtIn) { + case BuiltInTessLevelOuter: + type = FixedVectorType::get(builder.getFloatTy(), 4); + break; + case BuiltInTessLevelInner: + type = FixedVectorType::get(builder.getFloatTy(), 2); + break; + default: + llvm_unreachable("Unexpected built-in"); + break; + } + builtInLocsToTypes[loc] = type; + } + } + + // baseOffset = patchConstStart + relPatchId * patchConstSize + auto onChipLdsBaseOffset = builder.CreateMul(relPatchId, builder.getInt32(hwConfig.onChip.patchConstSize)); + onChipLdsBaseOffset = builder.CreateAdd(onChipLdsBaseOffset, builder.getInt32(hwConfig.onChip.patchConstStart)); + + auto offChipLdsBaseOffset = builder.CreateMul(relPatchId, builder.getInt32(hwConfig.offChip.patchConstSize)); + 
offChipLdsBaseOffset = builder.CreateAdd(offChipLdsBaseOffset, builder.getInt32(hwConfig.offChip.patchConstStart)); + + for (unsigned loc = 0; loc < inOutUsage.perPatchOutputMapLocCount; ++loc) { + if (builtInLocsNotToWrite.count(loc) > 0) + continue; + + Type *outputTy = FixedVectorType::get(builder.getInt32Ty(), 4); // <4 x i32> for generic outputs + if (builtInLocsToTypes.count(loc) > 0) + outputTy = builtInLocsToTypes[loc]; // Built-in outputs have known types + + const unsigned numComponents = outputTy->isVectorTy() ? cast(outputTy)->getNumElements() : 1; + + // ldsOffset = baseOffset + attribOffset + auto attribOffset = builder.getInt32(4 * loc); + auto onChipLdsOffset = builder.CreateAdd(onChipLdsBaseOffset, attribOffset); + auto output = readValueFromLds(outputTy, onChipLdsOffset); + + auto offChipLdsOffset = builder.CreateAdd(offChipLdsBaseOffset, attribOffset); + offChipLdsOffset = builder.CreateMul(offChipLdsOffset, builder.getInt32(4)); // Convert to byte offset + + builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_raw_tbuffer_store, + {output, // vdata + offChipLdsDesc, // rsrc + offChipLdsOffset, // voffset + offChipLdsBase, // soffset + builder.getInt32(bufferFormats[numComponents - 1]), // format + builder.getInt32(coherent.u32All)}); // glc + } + } +} + // ===================================================================================================================== // Merge shaders and set calling convention for the entry-point of each shader (GFX9+) // @@ -419,10 +623,11 @@ void PreparePipelineAbi::addAbiMetadata(Module &module) { } // ===================================================================================================================== -// Handle the store of tessellation factors. +// Handle the store of tessellation factors (TFs) and the store of HS outputs to off-chip LDS buffer if the patch is +// valid (all of its outer TFs are greater than zero). 
// // @param entryPoint : Entry-point of tessellation control shader -void PreparePipelineAbi::storeTessFactors(Function *entryPoint) { +void PreparePipelineAbi::storeTessFactorsAndHsOutputs(Function *entryPoint) { assert(getShaderStage(entryPoint) == ShaderStage::TessControl); // Must be tessellation control shader if (m_pipelineState->canOptimizeTessFactor()) @@ -439,21 +644,26 @@ void PreparePipelineAbi::storeTessFactors(Function *entryPoint) { } assert(retInst); // Must have return instruction - IRBuilder<> builder(*m_context); + BuilderBase builder(*m_context); builder.SetInsertPoint(retInst); PipelineSystemValues pipelineSysValues; pipelineSysValues.initialize(m_pipelineState); const auto tfBufferDesc = pipelineSysValues.get(entryPoint)->getTessFactorBufDesc(); + const auto offChipLdsDesc = pipelineSysValues.get(entryPoint)->getOffChipLdsDesc(); const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::TessControl)->entryArgIdxs.tcs; const auto tfBufferBase = getFunctionArgument(entryPoint, entryArgIdxs.tfBufferBase); + const auto offChipLdsBase = getFunctionArgument(entryPoint, entryArgIdxs.offChipLdsBase); const auto relPatchId = pipelineSysValues.get(entryPoint)->getRelativeId(); + const auto vertexIdx = pipelineSysValues.get(entryPoint)->getInvocationId(); // Read back tessellation factors and write them to TF buffer - auto tessFactors = readTessFactors(m_pipelineState, relPatchId, builder); - writeTessFactors(m_pipelineState, tfBufferDesc, tfBufferBase, relPatchId, tessFactors.first, tessFactors.second, - builder); + const auto &[outerTf, innerTf] = readTessFactors(m_pipelineState, relPatchId, builder); + writeTessFactors(m_pipelineState, tfBufferDesc, tfBufferBase, relPatchId, outerTf, innerTf, builder); + + // Write HS outputs to off-chip LDS buffer + writeHsOutputs(m_pipelineState, offChipLdsDesc, offChipLdsBase, relPatchId, vertexIdx, outerTf, builder); pipelineSysValues.clear(); } diff --git 
a/lgc/patch/RegisterMetadataBuilder.cpp b/lgc/patch/RegisterMetadataBuilder.cpp index 87ecf557de..cc77461ed5 100644 --- a/lgc/patch/RegisterMetadataBuilder.cpp +++ b/lgc/patch/RegisterMetadataBuilder.cpp @@ -191,9 +191,9 @@ void RegisterMetadataBuilder::buildLsHsRegisters() { getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtHosMaxTessLevel] = maxTessFactor; // VGT_LS_HS_CONFIG - const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.calcFactor; + const auto &hwConfig = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.hwConfig; auto vgtLsHsConfig = getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtLsHsConfig].getMap(true); - vgtLsHsConfig[Util::Abi::VgtLsHsConfigMetadataKey::NumPatches] = calcFactor.patchCountPerThreadGroup; + vgtLsHsConfig[Util::Abi::VgtLsHsConfigMetadataKey::NumPatches] = hwConfig.maxNumPatchesPerGroup; vgtLsHsConfig[Util::Abi::VgtLsHsConfigMetadataKey::HsNumInputCp] = m_pipelineState->getNumPatchControlPoints(); vgtLsHsConfig[Util::Abi::VgtLsHsConfigMetadataKey::HsNumOutputCp] = m_pipelineState->getShaderModes()->getTessellationMode().outputVertices; @@ -215,8 +215,8 @@ void RegisterMetadataBuilder::buildLsHsRegisters() { getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::LsVgprCompCnt] = lsVgprCompCnt; // Set LDS_SIZE of SPI_SHADER_PGM_RSRC2_HS - unsigned ldsSizeInDwords = calcFactor.tessOnChipLdsSize; - ldsSizeInDwords += calcFactor.rayQueryLdsStackSize; + unsigned ldsSizeInDwords = hwConfig.tessOnChipLdsSize; + ldsSizeInDwords += hwConfig.rayQueryLdsStackSize; auto hwShaderNode = getHwShaderNode(Util::Abi::HardwareStage::Hs); hwShaderNode[Util::Abi::HardwareStageMetadataKey::LdsSize] = calcLdsSize(ldsSizeInDwords); @@ -238,19 +238,19 @@ void RegisterMetadataBuilder::buildEsGsRegisters() { const auto &gsBuiltInUsage = gsResUsage->builtInUsage.gs; const auto &gsInOutUsage = gsResUsage->inOutUsage; const auto &geometryMode = 
m_pipelineState->getShaderModes()->getGeometryShaderMode(); - const auto &calcFactor = gsInOutUsage.gs.calcFactor; + const auto &hwConfig = gsInOutUsage.gs.hwConfig; const auto tesResUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessEval); const auto &tesBuiltInUsage = tesResUsage->builtInUsage.tes; const bool hasTs = m_hasTcs || m_hasTes; // ES_VGPR_COMP_CNT in SPI_SHADER_PGM_RSRC2_GS unsigned gsVgprCompCnt = 0; - if ((calcFactor.inputVertices > 4 && geometryMode.inputPrimitive != InputPrimitives::Patch) || + if ((hwConfig.inputVertices > 4 && geometryMode.inputPrimitive != InputPrimitives::Patch) || gsBuiltInUsage.invocationId) gsVgprCompCnt = 3; // Enable vtx4/vtx5 offset (GS VGPR3) or GS instance ID (GS VGPR4) else if (gsBuiltInUsage.primitiveIdIn) gsVgprCompCnt = 2; // Enable primitive ID (GS VGPR2) - else if (calcFactor.inputVertices > 2 && geometryMode.inputPrimitive != InputPrimitives::Patch) + else if (hwConfig.inputVertices > 2 && geometryMode.inputPrimitive != InputPrimitives::Patch) gsVgprCompCnt = 1; // Enable vtx2/vtx3 offset (GS VGPR1) getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::GsVgprCompCnt] = gsVgprCompCnt; @@ -295,12 +295,12 @@ void RegisterMetadataBuilder::buildEsGsRegisters() { // VGT_GS_ONCHIP_CNTL auto vgtGsOnChipCntl = getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtGsOnchipCntl].getMap(true); - vgtGsOnChipCntl[Util::Abi::VgtGsOnchipCntlMetadataKey::EsVertsPerSubgroup] = calcFactor.esVertsPerSubgroup; - vgtGsOnChipCntl[Util::Abi::VgtGsOnchipCntlMetadataKey::GsPrimsPerSubgroup] = calcFactor.gsPrimsPerSubgroup; + vgtGsOnChipCntl[Util::Abi::VgtGsOnchipCntlMetadataKey::EsVertsPerSubgroup] = hwConfig.esVertsPerSubgroup; + vgtGsOnChipCntl[Util::Abi::VgtGsOnchipCntlMetadataKey::GsPrimsPerSubgroup] = hwConfig.gsPrimsPerSubgroup; // NOTE: The value of field "GS_INST_PRIMS_IN_SUBGRP" should be strictly equal to the product of // VGT_GS_ONCHIP_CNTL.GS_PRIMS_PER_SUBGRP * VGT_GS_INSTANCE_CNT.CNT. 
const unsigned gsInstPrimsInSubgrp = - geometryMode.invocations > 1 ? (calcFactor.gsPrimsPerSubgroup * geometryMode.invocations) : 0; + geometryMode.invocations > 1 ? (hwConfig.gsPrimsPerSubgroup * geometryMode.invocations) : 0; vgtGsOnChipCntl[Util::Abi::VgtGsOnchipCntlMetadataKey::GsInstPrimsPerSubgrp] = gsInstPrimsInSubgrp; // VGT_GS_VERT_ITEMSIZE and VGT_GSVS_RING_OFFSET @@ -311,7 +311,7 @@ void RegisterMetadataBuilder::buildEsGsRegisters() { const unsigned itemCount = 4; unsigned gsVsRingOffset = 0; for (unsigned i = 0; i < itemCount; ++i) { - unsigned itemSize = gsInOutUsage.gs.calcFactor.gsVsVertexItemSize[i]; + unsigned itemSize = gsInOutUsage.gs.hwConfig.gsVsVertexItemSize[i]; itemSizeArrayNode[i] = itemSize; if (i < itemCount - 1) { gsVsRingOffset += itemSize * maxVertOut; @@ -352,10 +352,10 @@ void RegisterMetadataBuilder::buildEsGsRegisters() { } // VGT_GSVS_RING_ITEMSIZE - getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtGsvsRingItemsize] = calcFactor.gsVsRingItemSize; + getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtGsvsRingItemsize] = hwConfig.gsVsRingItemSize; // VGT_ESGS_RING_ITEMSIZE - getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtEsgsRingItemsize] = calcFactor.esGsRingItemSize; + getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtEsgsRingItemsize] = hwConfig.esGsRingItemSize; // VGT_LS_HS_CONFIG if (geometryMode.inputPrimitive == InputPrimitives::Patch) { @@ -369,8 +369,8 @@ void RegisterMetadataBuilder::buildEsGsRegisters() { getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::MaxVertsPerSubgroup] = maxPrimsPerSubgroup; // Set LDS_SIZE of SPI_SHADER_PGM_RSRC2_GS - unsigned ldsSizeInDwords = calcFactor.gsOnChipLdsSize; - ldsSizeInDwords += calcFactor.rayQueryLdsStackSize; + unsigned ldsSizeInDwords = hwConfig.gsOnChipLdsSize; + ldsSizeInDwords += hwConfig.rayQueryLdsStackSize; auto hwShaderNode = getHwShaderNode(Util::Abi::HardwareStage::Gs); 
hwShaderNode[Util::Abi::HardwareStageMetadataKey::LdsSize] = calcLdsSize(ldsSizeInDwords); @@ -388,7 +388,7 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { const auto &gsBuiltInUsage = gsResUsage->builtInUsage.gs; const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); const auto &gsInOutUsage = gsResUsage->inOutUsage; - const auto &calcFactor = gsInOutUsage.gs.calcFactor; + const auto &hwConfig = gsInOutUsage.gs.hwConfig; const auto meshResUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Mesh); const auto &meshBuiltInUsage = meshResUsage->builtInUsage.mesh; const auto &meshMode = m_pipelineState->getShaderModes()->getMeshShaderMode(); @@ -399,12 +399,12 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { unsigned gsVgprCompCnt = 0; if (m_gfxIp.major <= 11) { if (m_hasGs) { - if ((calcFactor.inputVertices > 4 && geometryMode.inputPrimitive != InputPrimitives::Patch) || + if ((hwConfig.inputVertices > 4 && geometryMode.inputPrimitive != InputPrimitives::Patch) || gsBuiltInUsage.invocationId) gsVgprCompCnt = 3; // Enable vtx4/vtx5 offset (GS VGPR3) or GS instance ID (GS VGPR4) else if (gsBuiltInUsage.primitiveIdIn) gsVgprCompCnt = 2; // Enable primitive ID (GS VGPR2) - else if (calcFactor.inputVertices > 2 && geometryMode.inputPrimitive != InputPrimitives::Patch) + else if (hwConfig.inputVertices > 2 && geometryMode.inputPrimitive != InputPrimitives::Patch) gsVgprCompCnt = 1; // Enable vtx2/vtx3 offset (GS VGPR1) } else if (m_hasVs) { // NOTE: When GS is absent, only those VGPRs are required: vtx0/vtx1 offset, vtx2/vtx3 offset, @@ -452,13 +452,13 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { // VGT_GS_ONCHIP_CNTL auto vgtGsOnchipCntl = getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtGsOnchipCntl].getMap(true); - vgtGsOnchipCntl[Util::Abi::VgtGsOnchipCntlMetadataKey::EsVertsPerSubgroup] = calcFactor.esVertsPerSubgroup; - 
vgtGsOnchipCntl[Util::Abi::VgtGsOnchipCntlMetadataKey::GsPrimsPerSubgroup] = calcFactor.gsPrimsPerSubgroup; + vgtGsOnchipCntl[Util::Abi::VgtGsOnchipCntlMetadataKey::EsVertsPerSubgroup] = hwConfig.esVertsPerSubgroup; + vgtGsOnchipCntl[Util::Abi::VgtGsOnchipCntlMetadataKey::GsPrimsPerSubgroup] = hwConfig.gsPrimsPerSubgroup; unsigned gsInstPrimsInSubgrp = 1; if (!m_hasMesh) { - gsInstPrimsInSubgrp = geometryMode.invocations > 1 ? (calcFactor.gsPrimsPerSubgroup * geometryMode.invocations) - : calcFactor.gsPrimsPerSubgroup; + gsInstPrimsInSubgrp = geometryMode.invocations > 1 ? (hwConfig.gsPrimsPerSubgroup * geometryMode.invocations) + : hwConfig.gsPrimsPerSubgroup; } vgtGsOnchipCntl[Util::Abi::VgtGsOnchipCntlMetadataKey::GsInstPrimsPerSubgrp] = gsInstPrimsInSubgrp; @@ -533,13 +533,13 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { vgtGsOutPrimType[Util::Abi::VgtGsOutPrimTypeMetadataKey::OutprimType] = m_pipelineState->getPalMetadata()->serializeEnum(Util::Abi::GsOutPrimType(gsOutputPrimitiveType)); - assert(calcFactor.primAmpFactor >= 1); + assert(hwConfig.primAmpFactor >= 1); unsigned maxVertsPerSubgroup = NggMaxThreadsPerSubgroup; unsigned threadsPerSubgroup = NggMaxThreadsPerSubgroup; unsigned spiShaderIdsFormat = SPI_SHADER_1COMP; if (m_hasMesh) { maxVertsPerSubgroup = std::min(meshMode.outputVertices, NggMaxThreadsPerSubgroup); - threadsPerSubgroup = calcFactor.primAmpFactor; + threadsPerSubgroup = hwConfig.primAmpFactor; const bool enableMultiView = m_pipelineState->getInputAssemblyState().multiView != MultiViewMode::Disable; bool hasPrimitivePayload = meshBuiltInUsage.layer || meshBuiltInUsage.viewportIndex || meshBuiltInUsage.primitiveShadingRate || enableMultiView; @@ -578,7 +578,7 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { // group. Otherwise, it is set according to actual primitive amplification factor. const unsigned threadGroupSize = m_pipelineState->enableMeshRowExport() ? 
meshMode.workgroupSizeX * meshMode.workgroupSizeY * meshMode.workgroupSizeZ - : calcFactor.primAmpFactor; + : hwConfig.primAmpFactor; spiShaderGsMeshletDim[Util::Abi::SpiShaderGsMeshletDimMetadataKey::ThreadgroupSize] = threadGroupSize - 1; // SPI_SHADER_GS_MESHLET_EXP_ALLOC @@ -599,14 +599,13 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { vgtGsInstanceCnt[Util::Abi::VgtGsInstanceCntMetadataKey::Enable] = true; vgtGsInstanceCnt[Util::Abi::VgtGsInstanceCntMetadataKey::Count] = geometryMode.invocations; if (m_gfxIp >= GfxIpVersion{10, 1}) - vgtGsInstanceCnt[Util::Abi::VgtGsInstanceCntMetadataKey::EnMaxVertOutPerGsInstance] = - calcFactor.enableMaxVertOut; + vgtGsInstanceCnt[Util::Abi::VgtGsInstanceCntMetadataKey::EnMaxVertOutPerGsInstance] = hwConfig.enableMaxVertOut; } if (m_gfxIp.major <= 11) { // VGT_ESGS_RING_ITEMSIZE getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtEsgsRingItemsize] = - (m_hasGs ? calcFactor.esGsRingItemSize : 1); + (m_hasGs ? hwConfig.esGsRingItemSize : 1); } // VGT_LS_HS_CONFIG @@ -630,7 +629,7 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { // GE_NGG_SUBGRP_CNTL auto geNggSubgrpCntl = getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::GeNggSubgrpCntl].getMap(true); - geNggSubgrpCntl[Util::Abi::GeNggSubgrpCntlMetadataKey::PrimAmpFactor] = calcFactor.primAmpFactor; + geNggSubgrpCntl[Util::Abi::GeNggSubgrpCntlMetadataKey::PrimAmpFactor] = hwConfig.primAmpFactor; geNggSubgrpCntl[Util::Abi::GeNggSubgrpCntlMetadataKey::ThreadsPerSubgroup] = threadsPerSubgroup; // TODO: Support PIPELINE_PRIM_ID. @@ -638,7 +637,7 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::SpiShaderIdxFormat] = spiShaderIdsFormat; // Pipeline metadata - setNggSubgroupSize(m_hasMesh ? 1 : std::max(calcFactor.esVertsPerSubgroup, calcFactor.gsPrimsPerSubgroup)); + setNggSubgroupSize(m_hasMesh ? 
1 : std::max(hwConfig.esVertsPerSubgroup, hwConfig.gsPrimsPerSubgroup)); // // Build SW stream-out configuration (GFX11+) @@ -655,8 +654,8 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { } // Set LDS_SIZE of SPI_SHADER_PGM_RSRC2_GS - unsigned ldsSizeInDwords = calcFactor.gsOnChipLdsSize; - ldsSizeInDwords += calcFactor.rayQueryLdsStackSize; + unsigned ldsSizeInDwords = hwConfig.gsOnChipLdsSize; + ldsSizeInDwords += hwConfig.rayQueryLdsStackSize; auto hwShaderNode = getHwShaderNode(Util::Abi::HardwareStage::Gs); hwShaderNode[Util::Abi::HardwareStageMetadataKey::LdsSize] = calcLdsSize(ldsSizeInDwords); @@ -689,7 +688,7 @@ void RegisterMetadataBuilder::buildHwVsRegisters() { vgtStrmoutConfig[Util::Abi::VgtStrmoutConfigMetadataKey::Streamout_3En] = enablePrimStats || streamXfbBuffers[3] > 0; if (shaderStage == ShaderStage::CopyShader) { unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; - if (m_pipelineState->getRasterizerState().rasterStream == InvalidValue) { + if (rasterStream == InvalidValue) { // NOTE: According to HW register spec, rasterization stream has 3 bits, the lower 2 bits are programmed to stream // ID (0~3). If rasterization is not enabled for any stream, set the highest 1 bit to 1. static const unsigned NoRasterStream = 0x4; diff --git a/lgc/patch/SetupTargetFeatures.cpp b/lgc/patch/SetupTargetFeatures.cpp index 8d03f7ddbd..734b79ffef 100644 --- a/lgc/patch/SetupTargetFeatures.cpp +++ b/lgc/patch/SetupTargetFeatures.cpp @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file SetupTargetFeatures.cpp -* @brief LLPC source file: contains declaration and implementation of class lgc::PatchSetupTargetFeatures. +* @brief LLPC source file: contains declaration and implementation of class lgc::SetUpTargetFeatures. 
*********************************************************************************************************************** */ #include "lgc/patch/SetupTargetFeatures.h" @@ -35,7 +35,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "lgc-patch-setup-target-features" +#define DEBUG_TYPE "lgc-set-up-target-features" using namespace llvm; using namespace lgc; @@ -46,7 +46,7 @@ using namespace lgc; // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation // @returns : The preserved analyses (The analyses that are still valid after this pass) -PreservedAnalyses PatchSetupTargetFeatures::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses SetUpTargetFeatures::run(Module &module, ModuleAnalysisManager &analysisManager) { PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); LLVM_DEBUG(dbgs() << "Run the pass Patch-Setup-Target-Features\n"); @@ -78,7 +78,7 @@ PreservedAnalyses PatchSetupTargetFeatures::run(Module &module, ModuleAnalysisMa // Setup LLVM target features, target features are set per entry point function. 
// // @param [in/out] module : LLVM module -void PatchSetupTargetFeatures::setupTargetFeatures(Module *module) { +void SetUpTargetFeatures::setupTargetFeatures(Module *module) { std::string globalFeatures = ""; if (m_pipelineState->getOptions().includeDisassembly) diff --git a/lgc/patch/ShaderInputs.cpp b/lgc/patch/ShaderInputs.cpp index 6cd24e1cb3..b447a41784 100644 --- a/lgc/patch/ShaderInputs.cpp +++ b/lgc/patch/ShaderInputs.cpp @@ -94,7 +94,7 @@ CallInst *ShaderInputs::getSpecialUserData(UserDataMapping kind, BuilderBase &bu else if (kind == UserDataMapping::MeshTaskDispatchDims) ty = FixedVectorType::get(builder.getInt32Ty(), 3); else if (kind == UserDataMapping::Workgroup) - ty = FixedVectorType::get(builder.getInt32Ty(), 3)->getPointerTo(ADDR_SPACE_CONST); + ty = builder.getPtrTy(ADDR_SPACE_CONST); return builder.CreateNamedCall((Twine(lgcName::SpecialUserData) + getSpecialUserDataName(kind)).str(), ty, builder.getInt32(static_cast(kind)), Attribute::ReadNone); } @@ -103,10 +103,9 @@ CallInst *ShaderInputs::getSpecialUserData(UserDataMapping kind, BuilderBase &bu // Get a special user data value as a pointer by inserting a call to lgc.special.user.data then extending it // // @param kind : The kind of special user data, a UserDataMapping enum value -// @param pointeeTy : Type that the pointer will point to // @param builder : Builder to insert the call with -Value *ShaderInputs::getSpecialUserDataAsPointer(UserDataMapping kind, Type *pointeeTy, BuilderBase &builder) { - Type *pointerTy = pointeeTy->getPointerTo(ADDR_SPACE_CONST); +Value *ShaderInputs::getSpecialUserDataAsPointer(UserDataMapping kind, BuilderBase &builder) { + Type *pointerTy = builder.getPtrTy(ADDR_SPACE_CONST); std::string callName = lgcName::SpecialUserData; callName += getSpecialUserDataName(kind); callName += "."; @@ -114,7 +113,7 @@ Value *ShaderInputs::getSpecialUserDataAsPointer(UserDataMapping kind, Type *poi Value *userDataValue = builder.CreateNamedCall( 
(Twine(lgcName::SpecialUserData) + getSpecialUserDataName(kind)).str(), pointerTy, {builder.getInt32(static_cast(kind)), builder.getInt32(HighAddrPc)}, Attribute::ReadNone); - return builder.CreateIntToPtr(userDataValue, pointeeTy->getPointerTo(ADDR_SPACE_CONST)); + return builder.CreateIntToPtr(userDataValue, pointerTy); } // ===================================================================================================================== diff --git a/lgc/patch/ShaderMerger.cpp b/lgc/patch/ShaderMerger.cpp index 80001e0a0f..89f2c9a9be 100644 --- a/lgc/patch/ShaderMerger.cpp +++ b/lgc/patch/ShaderMerger.cpp @@ -470,11 +470,11 @@ Function *ShaderMerger::generateLsHsEntryPoint(Function *lsEntryPoint, Function // NOTE: The hsPatchCount is only valid for the first wave in the group. We have to store it to LDS to distribute // it through the group. - Value *hasPatchCount = builder.CreateLShr(mergeWaveInfo, 16); // hsWaveCount = mergedWaveInfo[24:16] - hasPatchCount = builder.CreateAnd(hasPatchCount, 0xFF); + Value *hsPatchCount = builder.CreateLShr(mergeWaveInfo, 16); // hsWaveCount = mergedWaveInfo[24:16] + hsPatchCount = builder.CreateAnd(hsPatchCount, 0xFF); const auto hsPatchCountStart = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl) - ->inOutUsage.tcs.calcFactor.onChip.hsPatchCountStart; - writeValueToLds(hasPatchCount, builder.getInt32(hsPatchCountStart), builder); + ->inOutUsage.tcs.hwConfig.onChip.hsPatchCountStart; + writeValueToLds(hsPatchCount, builder.getInt32(hsPatchCountStart), builder); builder.CreateBr(endDistribHsPatchCountBlock); // Construct ".endDistribHsPatchCount" block @@ -522,8 +522,12 @@ Function *ShaderMerger::generateLsHsEntryPoint(Function *lsEntryPoint, Function // Construct ".endHs" block builder.SetInsertPoint(endHsBlock); - if (m_pipelineState->canOptimizeTessFactor()) - storeTessFactorsWithOpt(threadIdInWave, builder); + if (m_pipelineState->canOptimizeTessFactor()) { + auto relativePatchId = 
builder.CreateAnd(relPatchId, builder.getInt32(0xFF)); + auto vertexIdx = builder.CreateIntrinsic(Intrinsic::amdgcn_ubfe, {builder.getInt32Ty()}, + {relPatchId, builder.getInt32(8), builder.getInt32(5)}); + storeTessFactorsAndHsOutputsWithOpt(threadIdInWave, relativePatchId, vertexIdx, builder); + } builder.CreateRetVoid(); @@ -678,7 +682,7 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function // Run GS // } // - const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor; + const auto &hwConfig = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig; SmallVector args; for (auto &arg : entryPoint->args()) @@ -724,7 +728,7 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function {mergedWaveInfo, builder.getInt32(24), builder.getInt32(4)}); waveInSubgroup->setName("waveInSubgroup"); - auto esGsOffset = builder.CreateMul(waveInSubgroup, builder.getInt32(64 * calcFactor.esGsRingItemSize)); + auto esGsOffset = builder.CreateMul(waveInSubgroup, builder.getInt32(64 * hwConfig.esGsRingItemSize)); auto validEsVert = builder.CreateICmpULT(threadIdInWave, esVertCount, "validEsVert"); builder.CreateCondBr(validEsVert, beginEsBlock, endEsBlock); @@ -737,7 +741,7 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function Value *esGsOffsets01 = vgprArgs[0]; Value *esGsOffsets23 = PoisonValue::get(builder.getInt32Ty()); - if (calcFactor.inputVertices > 2 && geometryMode.inputPrimitive != InputPrimitives::Patch) { + if (hwConfig.inputVertices > 2 && geometryMode.inputPrimitive != InputPrimitives::Patch) { // NOTE: ES to GS offset (vertex 2 and 3) is valid once the primitive type has more than 2 vertices. 
esGsOffsets23 = vgprArgs[1]; } @@ -746,7 +750,7 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function Value *invocationId = vgprArgs[3]; Value *esGsOffsets45 = PoisonValue::get(builder.getInt32Ty()); - if (calcFactor.inputVertices > 4 && geometryMode.inputPrimitive != InputPrimitives::Patch) { + if (hwConfig.inputVertices > 4 && geometryMode.inputPrimitive != InputPrimitives::Patch) { // NOTE: ES to GS offset (vertex 4 and 5) is valid once the primitive type has more than 4 vertices. esGsOffsets45 = vgprArgs[4]; } @@ -965,13 +969,12 @@ void ShaderMerger::processRayQueryLdsStack(Function *entryPoint1, Function *entr if (shaderStage == ShaderStage::TessControl) { // Must be LS-HS merged shader - const auto &calcFactor = - m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.calcFactor; - hasLdsStack = calcFactor.rayQueryLdsStackSize > 0; + const auto &hwConfig = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.hwConfig; + hasLdsStack = hwConfig.rayQueryLdsStackSize > 0; } else { // Must be ES-GS merged shader or NGG primitive shader - const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor; - hasLdsStack = calcFactor.rayQueryLdsStackSize > 0; + const auto &hwConfig = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.hwConfig; + hasLdsStack = hwConfig.rayQueryLdsStackSize > 0; } if (hasLdsStack) { @@ -997,21 +1000,31 @@ void ShaderMerger::processRayQueryLdsStack(Function *entryPoint1, Function *entr } // ===================================================================================================================== -// Handle the store of tessellation factors with optimization (TF0/TF1 messaging) +// Handle the store of tessellation factors with optimization (TF0/TF1 messaging) and the store of HS outputs to +// off-chip LDS buffer if the patch is valid (all of its outer TFs are greater 
than zero). // // @param threadIdInWave : Thread ID in wave +// @param relPatchId : Relative patch ID (output patch ID in group) +// @param vertexIdx : Vertex indexing (output control point ID) // @param builder : IR builder to insert instructions -void ShaderMerger::storeTessFactorsWithOpt(Value *threadIdInWave, IRBuilder<> &builder) { +void ShaderMerger::storeTessFactorsAndHsOutputsWithOpt(Value *threadIdInWave, Value *relPatchId, Value *vertexIdx, + BuilderBase &builder) { assert(m_pipelineState->canOptimizeTessFactor()); // // The processing is something like this: // - // OPTIMIZED_TF_STORE() { + // OPTIMIZED_TF_STORE_AND_HS_OUTPUTS_STORE() { + // if (threadIdInWave < hsVertexCount) { + // Read TFs from LDS (each thread corresponds to an output vertex) + // if (outerTfs > 0.0) + // Write HS outputs to off-chip LDS buffer + // } + // // Read hsPatchCount from LDS // // if (threadIdInGroup < hsPatchCount) { - // Read TFs from LDS (with a barrier to make sure TFs are written) + // Read TFs from LDS (each thread corresponds to a patch) // Compute per-thread specielTf // Compute per-wave specielTf // } @@ -1038,12 +1051,18 @@ void ShaderMerger::storeTessFactorsWithOpt(Value *threadIdInWave, IRBuilder<> &b // } // + const auto fastMathFlags = builder.getFastMathFlags(); + FastMathFlags newFastMathFlags(fastMathFlags); + newFastMathFlags.setNoNaNs(); // Set NoNaNs flag to let LLVM optimize floating-point min/max/eq in this algorithm. 
+ builder.setFastMathFlags(newFastMathFlags); + auto insertBlock = builder.GetInsertBlock(); auto entryPoint = insertBlock->getParent(); assert(entryPoint->getName() == lgcName::LsHsEntryPoint); // Must be LS-HS merged shader - const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.calcFactor; - const unsigned waveSize = m_pipelineState->getMergedShaderWaveSize(ShaderStage::TessControl); + const auto &inOutUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage; + const auto &hwConfig = inOutUsage.tcs.hwConfig; + const unsigned waveSize = m_pipelineState->getShaderWaveSize(ShaderStage::TessControl); assert(waveSize == 32 || waveSize == 64); // Helper to create a basic block @@ -1084,19 +1103,36 @@ void ShaderMerger::storeTessFactorsWithOpt(Value *threadIdInWave, IRBuilder<> &b auto storeTfBlock = createBlock(".storeTf"); auto endTryStoreTfBlock = createBlock(".endTryStoreTf"); + auto tryStoreHsOutputsBlock = createBlock(".tryStoreHsOutputs"); + auto endTryStoreHsOutputsBlock = createBlock(".endTryStoreHsOutputs"); + // Construct current insert block + Type *bufferDescTy = FixedVectorType::get(builder.getInt32Ty(), 4); + Value *globalTablePtr = nullptr; Value *waveIdInGroup = nullptr; Value *threadIdInGroup = nullptr; Value *hsPatchCount = nullptr; Value *validHsPatch = nullptr; { + auto userData = getFunctionArgument(entryPoint, NumSpecialSgprInputs); + auto globalTable = builder.CreateExtractElement( + userData, static_cast(0)); // The first element of user data argument is always internal global table + + Value *pc = builder.CreateIntrinsic(Intrinsic::amdgcn_s_getpc, {}, {}); + pc = builder.CreateBitCast(pc, FixedVectorType::get(builder.getInt32Ty(), 2)); + + globalTablePtr = builder.CreateInsertElement(pc, globalTable, static_cast(0)); + globalTablePtr = builder.CreateBitCast(globalTablePtr, builder.getInt64Ty()); + globalTablePtr = + builder.CreateIntToPtr(globalTablePtr, 
PointerType::get(bufferDescTy, ADDR_SPACE_CONST), "globalTablePtr"); + waveIdInGroup = getFunctionArgument(entryPoint, getSpecialSgprInputIndex(m_gfxIp, LsHs::waveIdInGroup)); waveIdInGroup = builder.CreateAnd(waveIdInGroup, 0x1F, "waveIdInGroup"); // waveIdInGroup = [4:0] threadIdInGroup = builder.CreateMul(builder.getInt32(waveSize), waveIdInGroup); threadIdInGroup = builder.CreateAdd(threadIdInGroup, threadIdInWave, "threadIdInGroup"); - const auto hsPatchCountStart = calcFactor.onChip.hsPatchCountStart; + const auto hsPatchCountStart = hwConfig.onChip.hsPatchCountStart; hsPatchCount = readValueFromLds(builder.getInt32Ty(), builder.getInt32(hsPatchCountStart), builder); hsPatchCount = builder.CreateIntrinsic(builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, hsPatchCount); hsPatchCount->setName("hsPatchCount"); @@ -1117,45 +1153,40 @@ void ShaderMerger::storeTessFactorsWithOpt(Value *threadIdInWave, IRBuilder<> &b outerTf = tessFactors.first; innerTf = tessFactors.second; - // Check special TFs - Value *one = ConstantFP::get(builder.getFloatTy(), 1.0); - Value *zero = ConstantFP::get(builder.getFloatTy(), 0.0); - - Value *isAllOnesTf = builder.getTrue(); - Value *isAllZerosTf = builder.getTrue(); - // Check if the thread has all-ones/all-zeros TFs - for (unsigned i = 0; i < cast(outerTf->getType())->getNumElements(); ++i) { - auto elem = builder.CreateExtractElement(outerTf, i); - Value *isOne = builder.CreateFCmpOEQ(elem, one); - Value *isZero = builder.CreateFCmpOEQ(elem, zero); - - isAllOnesTf = builder.CreateAnd(isAllOnesTf, isOne); - isAllZerosTf = builder.CreateAnd(isAllZerosTf, isZero); + auto minTf = builder.CreateExtractElement(outerTf, static_cast(0)); + auto maxTf = minTf; + for (unsigned i = 1; i < cast(outerTf->getType())->getNumElements(); ++i) { + auto elemTf = builder.CreateExtractElement(outerTf, i); + minTf = builder.CreateBinaryIntrinsic(Intrinsic::minimum, minTf, elemTf); + maxTf = builder.CreateBinaryIntrinsic(Intrinsic::maximum, maxTf, 
elemTf); } - // Check inner tessellation factors if (innerTf) { // Isoline doesn't have inner tessellation factors for (unsigned i = 0; i < cast(innerTf->getType())->getNumElements(); ++i) { - auto elem = builder.CreateExtractElement(innerTf, i); - Value *isOne = builder.CreateFCmpOEQ(elem, one); - Value *isZero = builder.CreateFCmpOEQ(elem, zero); - - isAllOnesTf = builder.CreateAnd(isAllOnesTf, isOne); - isAllZerosTf = builder.CreateAnd(isAllZerosTf, isZero); + auto elemTf = builder.CreateExtractElement(innerTf, i); + minTf = builder.CreateBinaryIntrinsic(Intrinsic::minimum, minTf, elemTf); + maxTf = builder.CreateBinaryIntrinsic(Intrinsic::maximum, maxTf, elemTf); } } - auto validhMask = ballot(builder.getTrue()); + auto minTfEqMaxTf = builder.CreateFCmpOEQ(minTf, maxTf); + Value *isOne = builder.CreateFCmpOEQ(minTf, ConstantFP::get(builder.getFloatTy(), 1.0)); + Value *isZero = builder.CreateFCmpOEQ(minTf, ConstantFP::get(builder.getFloatTy(), 0.0)); + + auto isAllOnesTf = builder.CreateAnd(minTfEqMaxTf, isOne); + auto isAllZerosTf = builder.CreateAnd(minTfEqMaxTf, isZero); + + auto validMask = ballot(builder.getTrue()); // Check if the wave has all-ones TFs uniformly Value *allOnesTfMask = ballot(isAllOnesTf); - auto isAllOnesTfInWave = builder.CreateICmpEQ(allOnesTfMask, validhMask); + auto isAllOnesTfInWave = builder.CreateICmpEQ(allOnesTfMask, validMask); // Check if the wave has all-zeros TFs uniformly Value *allZerosTfMask = ballot(isAllZerosTf); - auto isAllZerosTfInWave = builder.CreateICmpEQ(allZerosTfMask, validhMask); + auto isAllZerosTfInWave = builder.CreateICmpEQ(allZerosTfMask, validMask); specialTfInWave = std::make_pair(isAllOnesTfInWave, isAllZerosTfInWave); @@ -1195,7 +1226,7 @@ void ShaderMerger::storeTessFactorsWithOpt(Value *threadIdInWave, IRBuilder<> &b { builder.SetInsertPoint(handleMultiWaveBlock); - const unsigned specialTfValueStart = calcFactor.onChip.specialTfValueStart; + const unsigned specialTfValueStart = 
hwConfig.onChip.specialTfValueStart; // ldsOffset = specialTfValueStart + 2 * waveIdInGroup auto ldsOffset = builder.CreateAdd(builder.getInt32(specialTfValueStart), builder.CreateShl(waveIdInGroup, 1)); @@ -1220,7 +1251,7 @@ void ShaderMerger::storeTessFactorsWithOpt(Value *threadIdInWave, IRBuilder<> &b { builder.SetInsertPoint(checkSpecilTfInGroupBlock); - const unsigned specialTfValueStart = calcFactor.onChip.specialTfValueStart; + const unsigned specialTfValueStart = hwConfig.onChip.specialTfValueStart; // ldsOffset = specialTfValueStart + 2 * threadIdInWave auto ldsOffset = builder.CreateAdd(builder.getInt32(specialTfValueStart), builder.CreateShl(threadIdInWave, 1)); @@ -1307,23 +1338,10 @@ void ShaderMerger::storeTessFactorsWithOpt(Value *threadIdInWave, IRBuilder<> &b { builder.SetInsertPoint(storeTfBlock); - auto userData = getFunctionArgument(entryPoint, NumSpecialSgprInputs); - auto globalTable = builder.CreateExtractElement( - userData, static_cast(0)); // The first element of user data argument is always internal global table - - Value *pc = builder.CreateIntrinsic(Intrinsic::amdgcn_s_getpc, {}, {}); - pc = builder.CreateBitCast(pc, FixedVectorType::get(builder.getInt32Ty(), 2)); - - Value *globalTablePtr = builder.CreateInsertElement(pc, globalTable, static_cast(0)); - globalTablePtr = builder.CreateBitCast(globalTablePtr, builder.getInt64Ty()); - Type *tfBufferDescTy = FixedVectorType::get(builder.getInt32Ty(), 4); - globalTablePtr = - builder.CreateIntToPtr(globalTablePtr, PointerType::get(tfBufferDescTy, ADDR_SPACE_CONST), "globalTablePtr"); - Value *tfBufferDescPtr = builder.CreateConstGEP1_32(builder.getInt8Ty(), globalTablePtr, SiDrvTableTfBufferOffs * 4, "tfBufferDescPtr"); - auto tfBufferDesc = builder.CreateLoad(tfBufferDescTy, tfBufferDescPtr, "tfBufferDesc"); - Value *tfBufferBase = getFunctionArgument(entryPoint, getSpecialSgprInputIndex(m_gfxIp, LsHs::TfBufferBase)); + auto tfBufferDesc = builder.CreateLoad(bufferDescTy, tfBufferDescPtr, 
"tfBufferDesc"); + auto tfBufferBase = getFunctionArgument(entryPoint, getSpecialSgprInputIndex(m_gfxIp, LsHs::TfBufferBase)); // Store TFs to TF buffer PreparePipelineAbi::writeTessFactors(m_pipelineState, tfBufferDesc, tfBufferBase, threadIdInGroup, outerTf, innerTf, @@ -1334,8 +1352,41 @@ void ShaderMerger::storeTessFactorsWithOpt(Value *threadIdInWave, IRBuilder<> &b // Construct ".endTryStoreTf" block { builder.SetInsertPoint(endTryStoreTfBlock); + + // hsVertexCount = mergeWaveInfo[15:8] + auto mergeWaveInfo = getFunctionArgument(entryPoint, getSpecialSgprInputIndex(m_gfxIp, LsHs::MergedWaveInfo)); + auto hsVertexCount = builder.CreateIntrinsic(Intrinsic::amdgcn_ubfe, {builder.getInt32Ty()}, + {mergeWaveInfo, builder.getInt32(8), builder.getInt32(8)}); + hsVertexCount->setName("hsVertexCount"); + + auto validHsVertex = builder.CreateICmpULT(threadIdInWave, hsVertexCount, "validHsVertex"); + builder.CreateCondBr(validHsVertex, tryStoreHsOutputsBlock, endTryStoreHsOutputsBlock); + } + + // Construct ".tryStoreHsOutputs" block + { + builder.SetInsertPoint(tryStoreHsOutputsBlock); + + Value *offChipLdsDescPtr = builder.CreateConstGEP1_32(builder.getInt8Ty(), globalTablePtr, + SiDrvTableHsBufferOffs * 4, "offChipLdsDescPtr"); + auto offChipLdsDesc = builder.CreateLoad(bufferDescTy, offChipLdsDescPtr, "offChipLdsDesc"); + auto offChipLdsBase = getFunctionArgument(entryPoint, getSpecialSgprInputIndex(m_gfxIp, LsHs::OffChipLdsBase)); + + // Store HS outputs to off-chip LDS buffer + const auto &[outerTf, innerTf] = PreparePipelineAbi::readTessFactors(m_pipelineState, relPatchId, builder); + PreparePipelineAbi::writeHsOutputs(m_pipelineState, offChipLdsDesc, offChipLdsBase, relPatchId, vertexIdx, outerTf, + builder); + + builder.CreateBr(endTryStoreHsOutputsBlock); + } + + // Construct ".endTryStoreHsOutputs" block + { + builder.SetInsertPoint(endTryStoreHsOutputsBlock); // Do nothing } + + builder.setFastMathFlags(fastMathFlags); // Restore fast math flags } // 
===================================================================================================================== diff --git a/lgc/patch/ShaderMerger.h b/lgc/patch/ShaderMerger.h index 587816aaf9..6cebf1aee2 100644 --- a/lgc/patch/ShaderMerger.h +++ b/lgc/patch/ShaderMerger.h @@ -119,7 +119,8 @@ class ShaderMerger { void processRayQueryLdsStack(llvm::Function *entryPoint1, llvm::Function *entryPoint2) const; - void storeTessFactorsWithOpt(llvm::Value *threadIdInWave, llvm::IRBuilder<> &builder); + void storeTessFactorsAndHsOutputsWithOpt(llvm::Value *threadIdInWave, llvm::Value *relPatchId, llvm::Value *vertexIdx, + BuilderBase &builder); llvm::Value *readValueFromLds(llvm::Type *readTy, llvm::Value *ldsOffset, llvm::IRBuilder<> &builder); void writeValueToLds(llvm::Value *writeValue, llvm::Value *ldsOffset, llvm::IRBuilder<> &builder); void createBarrier(llvm::IRBuilder<> &builder); diff --git a/lgc/patch/StructurizeBuffers.cpp b/lgc/patch/StructurizeBuffers.cpp index 9519a73125..16e695bac1 100644 --- a/lgc/patch/StructurizeBuffers.cpp +++ b/lgc/patch/StructurizeBuffers.cpp @@ -182,7 +182,7 @@ bool StructurizeBuffersImpl::run() { strided = m_builder.create(strided, bufferIndexOp->getIndex()); toRemove.push_back(bufferIndexOp); - CompilerUtils::replaceAllPointerUses(&m_builder, bufferIndexOp, strided, toRemove); + CompilerUtils::replaceAllPointerUses(bufferIndexOp, strided, toRemove); } } } diff --git a/lgc/patch/SystemValues.cpp b/lgc/patch/SystemValues.cpp index 9ef1d855ae..fe74678da8 100644 --- a/lgc/patch/SystemValues.cpp +++ b/lgc/patch/SystemValues.cpp @@ -197,7 +197,7 @@ Value *ShaderSystemValues::getOffChipLdsDesc() { if (!m_offChipLdsDesc) { // Ensure we have got the global table pointer first, and insert new code after that. 
BuilderBase builder(getInternalGlobalTablePtr()->getNextNode()); - m_offChipLdsDesc = loadDescFromDriverTable(SiDrvTableHsBuffeR0Offs, builder); + m_offChipLdsDesc = loadDescFromDriverTable(SiDrvTableHsBufferOffs, builder); } return m_offChipLdsDesc; } @@ -272,7 +272,7 @@ Value *ShaderSystemValues::getGsVsRingBufDesc(unsigned streamId) { unsigned streamItemOffset = 0; for (int i = 0; i < streamId; ++i) - streamItemOffset += resUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] * + streamItemOffset += resUsage->inOutUsage.gs.hwConfig.gsVsVertexItemSize[i] * m_pipelineState->getShaderModes()->getGeometryShaderMode().outputVertices; // streamSize[streamId] = outLocCount[streamId] * 4 * sizeof(unsigned) @@ -295,7 +295,7 @@ Value *ShaderSystemValues::getGsVsRingBufDesc(unsigned streamId) { // Calculate and set stride in SRD dword1 unsigned gsVsStride = m_pipelineState->getShaderModes()->getGeometryShaderMode().outputVertices * - resUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[streamId] * 4; + resUsage->inOutUsage.gs.hwConfig.gsVsVertexItemSize[streamId] * 4; SqBufRsrcWord1 strideSetValue = {}; strideSetValue.bits.stride = gsVsStride; @@ -352,7 +352,7 @@ Value *ShaderSystemValues::getTotalEmitCounterPtr() { // Get internal global table pointer as pointer to i8. Instruction *ShaderSystemValues::getInternalGlobalTablePtr() { if (!m_internalGlobalTablePtr) { - auto ptrTy = Type::getInt8Ty(*m_context)->getPointerTo(ADDR_SPACE_CONST); + auto ptrTy = PointerType::get(*m_context, ADDR_SPACE_CONST); // Global table is always the first function argument (separate shader) or the eighth function argument (merged // shader). And mesh shader is actually mapped to ES-GS merged shader. 
m_internalGlobalTablePtr = makePointer( @@ -386,7 +386,7 @@ Value *ShaderSystemValues::getMeshPipeStatsBufPtr() { } assert(entryArgIdx != 0); - auto ptrTy = Type::getInt8Ty(*m_context)->getPointerTo(ADDR_SPACE_GLOBAL); + auto ptrTy = PointerType::get(*m_context, ADDR_SPACE_GLOBAL); m_meshPipeStatsBufPtr = makePointer(getFunctionArgument(m_entryPoint, entryArgIdx, "meshPipeStatsBuf"), ptrTy, InvalidValue); } diff --git a/lgc/patch/VertexFetch.cpp b/lgc/patch/VertexFetch.cpp index 598b3d6dda..6485180b1c 100644 --- a/lgc/patch/VertexFetch.cpp +++ b/lgc/patch/VertexFetch.cpp @@ -732,8 +732,7 @@ Value *VertexFetchImpl::fetchVertex(LoadVertexInputOp *inst, Value *descPtr, Val if (!m_vertexBufTablePtr) { IRBuilderBase::InsertPointGuard ipg(builder); builder.SetInsertPointPastAllocas(inst->getFunction()); - m_vertexBufTablePtr = - ShaderInputs::getSpecialUserDataAsPointer(UserDataMapping::VertexBufferTable, vbDescTy, builder); + m_vertexBufTablePtr = ShaderInputs::getSpecialUserDataAsPointer(UserDataMapping::VertexBufferTable, builder); } // Helper to create basic block @@ -1482,7 +1481,9 @@ Value *VertexFetchImpl::loadVertexBufferDescriptor(unsigned binding, BuilderImpl auto descPtr = builderImpl.CreateBufferDesc(InternalDescriptorSetId, CurrentAttributeBufferBinding, builderImpl.getInt32(0), lgc::Builder::BufferFlagAddress, false); // Create descriptor by a 64-bits pointer - m_curAttribBufferDescr = builderImpl.buildBufferCompactDesc(descPtr, 0); + descPtr = builderImpl.CreatePtrToInt(descPtr, builderImpl.getInt64Ty()); + descPtr = builderImpl.CreateBitCast(descPtr, FixedVectorType::get(builderImpl.getInt32Ty(), 2)); + m_curAttribBufferDescr = builderImpl.buildBufferCompactDesc(descPtr, nullptr); } vtxDesc = m_curAttribBufferDescr; } else { @@ -1495,14 +1496,13 @@ Value *VertexFetchImpl::loadVertexBufferDescriptor(unsigned binding, BuilderImpl } // Get the vertex buffer table pointer as pointer to v4i32 descriptor. 
- Type *vbDescTy = FixedVectorType::get(Type::getInt32Ty(*m_context), 4); if (!m_vertexBufTablePtr) { IRBuilder<>::InsertPointGuard guard(builder); builder.SetInsertPointPastAllocas(builder.GetInsertPoint()->getFunction()); - m_vertexBufTablePtr = - ShaderInputs::getSpecialUserDataAsPointer(UserDataMapping::VertexBufferTable, vbDescTy, builder); + m_vertexBufTablePtr = ShaderInputs::getSpecialUserDataAsPointer(UserDataMapping::VertexBufferTable, builder); } + Type *vbDescTy = FixedVectorType::get(Type::getInt32Ty(*m_context), 4); Value *vbDescPtr = builder.CreateGEP(vbDescTy, m_vertexBufTablePtr, builder.getInt64(binding)); LoadInst *vbDesc = builder.CreateLoad(vbDescTy, vbDescPtr); vbDesc->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(vbDesc->getContext(), {})); diff --git a/lgc/state/LgcContext.cpp b/lgc/state/LgcContext.cpp index 8966b1583f..43e36b293d 100644 --- a/lgc/state/LgcContext.cpp +++ b/lgc/state/LgcContext.cpp @@ -30,13 +30,13 @@ */ #include "lgc/LgcContext.h" #include "lgc/Builder.h" +#include "lgc/Debug.h" #include "lgc/LgcDialect.h" #include "lgc/PassManager.h" #include "lgc/patch/LgcLowering.h" #include "lgc/state/PassManagerCache.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" -#include "lgc/util/Debug.h" #include "lgc/util/Internal.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Bitcode/BitcodeWriterPass.h" @@ -147,6 +147,8 @@ void LgcContext::initialize() { setOptionDefault("enable-phi-of-ops", "0"); setOptionDefault("amdgpu-vgpr-index-mode", "1"); // force VGPR indexing on GFX8 setOptionDefault("amdgpu-atomic-optimizer-strategy", "DPP"); + // Relax occupancy target for amdgpu-memory-bound function. 
+ setOptionDefault("amdgpu-schedule-relaxed-occupancy", "true"); setOptionDefault("structurizecfg-skip-uniform-regions", "1"); setOptionDefault("spec-exec-max-speculation-cost", "10"); #if !defined(LLVM_HAVE_BRANCH_AMD_GFX) @@ -168,12 +170,14 @@ std::string LgcContext::getGpuNameString(unsigned major, unsigned minor, unsigne // converts that to an LLVM target name, with is "gfx" followed by the three decimal numbers with // no separators, e.g. "gfx1010" for 10.1.0. A high stepping number 0xFFFA..0xFFFF denotes an // experimental target, and that is represented by the final hexadecimal digit, e.g. "gfx101A" - // for 10.1.0xFFFA. In gfx9, stepping numbers 10..35 are represented by lower case letter 'a'..'z'. + // for 10.1.0xFFFA. std::string gpuName; raw_string_ostream gpuNameStream(gpuName); gpuNameStream << "gfx" << major << minor; if (stepping >= 0xFFFA) gpuNameStream << char(stepping - 0xFFFA + 'A'); + else if (stepping >= 0xA) + gpuNameStream << char(stepping - 0xA + 'A'); else gpuNameStream << stepping; diff --git a/lgc/state/PassManagerCache.cpp b/lgc/state/PassManagerCache.cpp index bc563f9aa2..c49530e17a 100644 --- a/lgc/state/PassManagerCache.cpp +++ b/lgc/state/PassManagerCache.cpp @@ -95,7 +95,7 @@ std::pair PassManagerCache::getPassMana fpm.addPass(InstSimplifyPass()); fpm.addPass(EarlyCSEPass(true)); passManagers.first->addPass(createModuleToFunctionPassAdaptor(std::move(fpm))); - passManagers.first->addPass(PatchSetupTargetFeatures()); + passManagers.first->addPass(SetUpTargetFeatures()); passManagers.first->addPass(IncludeLlvmIr()); // Add one last pass that does nothing, but invalidates all the analyses. 
diff --git a/lgc/state/PipelineState.cpp b/lgc/state/PipelineState.cpp index 68a8bf98e8..497020f1a6 100644 --- a/lgc/state/PipelineState.cpp +++ b/lgc/state/PipelineState.cpp @@ -68,6 +68,8 @@ static const char RsStateMetadataName[] = "lgc.rasterizer.state"; static const char ColorExportFormatsMetadataName[] = "lgc.color.export.formats"; static const char ColorExportStateMetadataName[] = "lgc.color.export.state"; static const char TessLevelMetadataName[] = "lgc.tessellation.level.state"; +static const char WaveSizeMetadataName[] = "lgc.wave.size"; +static const char SubgroupSizeMetadataName[] = "lgc.subgroup.size"; namespace { @@ -428,6 +430,8 @@ void PipelineState::clear(Module *module) { m_colorExportState = {}; m_inputAssemblyState = {}; m_rasterizerState = {}; + memset(m_waveSize, 0, sizeof(m_waveSize)); + memset(m_subgroupSize, 0, sizeof(m_subgroupSize)); record(module); } @@ -455,6 +459,7 @@ void PipelineState::recordExceptPalMetadata(Module *module) { recordVertexInputDescriptions(module); recordColorExportState(module); recordGraphicsState(module); + recordWaveSize(module); } // ===================================================================================================================== @@ -472,6 +477,7 @@ void PipelineState::readState(Module *module) { readGraphicsState(module); if (!m_palMetadata) m_palMetadata = new PalMetadata(this, module); + readWaveSize(module); setXfbStateMetadata(module); } @@ -1302,6 +1308,54 @@ void PipelineState::readGraphicsState(Module *module) { m_rasterizerState.innerCoverage = 1; } +// ===================================================================================================================== +// Record wave size for each shader stage into the IR metadata +// +// @param [in/out] module : IR module to record into +void PipelineState::recordWaveSize(Module *module) { + // Wave size is not determined yet, do it first. + // NOTE: m_waveSize is set up in one go, so checking [0] is enough. 
+ if (m_waveSize[0] == 0) + determineShaderWaveSize(module); + + setNamedMetadataToArrayOfInt32(module, m_waveSize, WaveSizeMetadataName); + setNamedMetadataToArrayOfInt32(module, m_subgroupSize, SubgroupSizeMetadataName); +} + +// ===================================================================================================================== +// Read wave size for each shader stage from the IR metadata +// +// @param [in/out] module : IR module to read from +void PipelineState::readWaveSize(Module *module) { + readNamedMetadataArrayOfInt32(module, WaveSizeMetadataName, m_waveSize); + readNamedMetadataArrayOfInt32(module, SubgroupSizeMetadataName, m_subgroupSize); +} + +// ===================================================================================================================== +// Determine shader wave size for each shader stage +// +// @param module : IR module +void PipelineState::determineShaderWaveSize(Module *module) { + // Wave size determination depends on shader modes. + getShaderModes()->readModesFromPipeline(module); + setAllShadersDefaultWaveSize(); + unsigned defaultWaveSize[ShaderStage::Count] = {}; + for (unsigned stage = 0; stage < ShaderStage::Count; ++stage) { + defaultWaveSize[stage] = m_waveSize[stage]; + } + + for (unsigned stage = 0; stage < ShaderStage::Count; ++stage) { + unsigned waveSize = hasShaderStage(static_cast(stage)) ? defaultWaveSize[stage] : 0; + auto mergingStage = getMergingShaderStage(static_cast(stage)); + unsigned mergingWaveSize = hasShaderStage(mergingStage) ? defaultWaveSize[mergingStage] : 0; + // Just use default wave size when neither stage is present. + if (waveSize == 0 && mergingWaveSize == 0) + continue; + + m_waveSize[stage] = std::max(waveSize, mergingWaveSize); + } +} + // ===================================================================================================================== // Get number of patch control points. The front-end supplies this as TessellationMode::inputVertices. 
unsigned PipelineState::getNumPatchControlPoints() const { @@ -1313,9 +1367,10 @@ unsigned PipelineState::getNumPatchControlPoints() const { // // @param stage : Shader stage unsigned PipelineState::getShaderWaveSize(ShaderStageEnum stage) { - if (m_waveSize.empty()) { + // Wave size is not read from metadata, this may happen for cases that we don't have this info available (e.g. null + // fragment), or lit test where people don't explicitly write these metadata. In such cases, use default values. + if (m_waveSize[0] == 0) setAllShadersDefaultWaveSize(); - } if (stage == ShaderStage::CopyShader) { // Treat copy shader as part of geometry shader @@ -1323,17 +1378,16 @@ unsigned PipelineState::getShaderWaveSize(ShaderStageEnum stage) { } assert(ShaderStageMask(ShaderStagesNative).contains(stage)); - return getMergedShaderWaveSize(stage); + assert(m_waveSize[stage] == 32 || m_waveSize[stage] == 64); + return m_waveSize[stage]; } // ===================================================================================================================== -// Gets wave size for the merged shader stage -// -// NOTE: For GFX9+, two shaders are merged as a shader pair. The wave size is determined by the larger one. +// Gets merging shader stage for the specified shader stage // // @param stage : Shader stage -unsigned PipelineState::getMergedShaderWaveSize(ShaderStageEnum stage) { - unsigned waveSize = m_waveSize[stage]; +ShaderStageEnum PipelineState::getMergingShaderStage(ShaderStageEnum stage) { + ShaderStageEnum mergingStage = stage; // NOTE: For GFX9+, two shaders are merged as a shader pair. The wave size is determined by the larger one. 
That is // to say: @@ -1344,36 +1398,34 @@ unsigned PipelineState::getMergedShaderWaveSize(ShaderStageEnum stage) { switch (stage) { case ShaderStage::Vertex: if (hasShaderStage(ShaderStage::TessControl)) { - return std::max(waveSize, m_waveSize[ShaderStage::TessControl]); - } - if (hasShaderStage(ShaderStage::Geometry)) { - return std::max(waveSize, m_waveSize[ShaderStage::Geometry]); + mergingStage = ShaderStage::TessControl; + } else if (hasShaderStage(ShaderStage::Geometry)) { + mergingStage = ShaderStage::Geometry; } - return waveSize; - + break; case ShaderStage::TessControl: - return std::max(waveSize, m_waveSize[ShaderStage::Vertex]); - + mergingStage = ShaderStage::Vertex; + break; case ShaderStage::TessEval: if (hasShaderStage(ShaderStage::Geometry)) { - return std::max(waveSize, m_waveSize[ShaderStage::Geometry]); + mergingStage = ShaderStage::Geometry; } - return waveSize; - + break; case ShaderStage::Geometry: - if (!hasShaderStage(ShaderStage::Geometry)) { - // NGG, no geometry - return std::max(waveSize, - m_waveSize[hasShaderStage(ShaderStage::TessEval) ? 
ShaderStage::TessEval : ShaderStage::Vertex]); - } if (hasShaderStage(ShaderStage::TessEval)) { - return std::max(waveSize, m_waveSize[ShaderStage::TessEval]); + mergingStage = ShaderStage::TessEval; + } else { + mergingStage = ShaderStage::Vertex; } - return std::max(waveSize, m_waveSize[ShaderStage::Vertex]); - + break; + case ShaderStage::CopyShader: + mergingStage = ShaderStage::Geometry; + break; default: - return waveSize; + break; } + + return mergingStage; } // ===================================================================================================================== @@ -1490,9 +1542,10 @@ unsigned PipelineState::getShaderHwStageMask(ShaderStageEnum stage) { // @param stage : Shader stage // @returns : Subgroup size of the specified shader stage unsigned PipelineState::getShaderSubgroupSize(ShaderStageEnum stage) { - if (m_subgroupSize.empty()) { + // Subgroup size is not read from metadata, this may happen for cases that we don't have this info available (e.g. + // null fragment), or lit test where people don't explicitly write these metadata. In such cases, use default values. + if (m_subgroupSize[0] == 0) setAllShadersDefaultWaveSize(); - } if (stage == ShaderStage::CopyShader) { // Treat copy shader as part of geometry shader @@ -1516,12 +1569,6 @@ void PipelineState::setAllShadersDefaultWaveSize() { // // @param stage : Shader stage void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) { - if (stage == ShaderStage::Geometry && !hasShaderStage(ShaderStage::Geometry)) { - // NOTE: For NGG, GS could be absent and VS/TES acts as part of it in the merged shader. - // In such cases, we check the property of VS or TES, and this will be handled in getMergedShaderWaveSize. 
- return; - } - if (stage == ShaderStage::Compute) { const unsigned subgroupSize = m_shaderModes.getComputeShaderMode().subgroupSize; m_waveSize[stage] = subgroupSize; @@ -1743,15 +1790,15 @@ InterfaceData *PipelineState::getShaderInterfaceData(ShaderStageEnum shaderStage // @param rtStack : Get size of LDS RayQuery stack for this stage // @return LDS size in dwords unsigned PipelineState::getShaderStaticLdsUsage(ShaderStageEnum shaderStage, bool rtStack) { - const ResourceUsage *RU = getShaderResourceUsage(shaderStage); + const ResourceUsage *resUsage = getShaderResourceUsage(shaderStage); switch (shaderStage) { case ShaderStage::TessControl: { - const auto &calcFactor = RU->inOutUsage.tcs.calcFactor; - return rtStack ? calcFactor.rayQueryLdsStackSize : calcFactor.tessOnChipLdsSize; + const auto &hwConfig = resUsage->inOutUsage.tcs.hwConfig; + return rtStack ? hwConfig.rayQueryLdsStackSize : hwConfig.tessOnChipLdsSize; } case ShaderStage::Geometry: { - const auto &calcFactor = RU->inOutUsage.gs.calcFactor; - return rtStack ? calcFactor.rayQueryLdsStackSize : calcFactor.gsOnChipLdsSize; + const auto &hwConfig = resUsage->inOutUsage.gs.hwConfig; + return rtStack ? 
hwConfig.rayQueryLdsStackSize : hwConfig.gsOnChipLdsSize; } default: return 0; diff --git a/lgc/state/ResourceUsage.cpp b/lgc/state/ResourceUsage.cpp index 2e09897e4d..7c3481fc02 100644 --- a/lgc/state/ResourceUsage.cpp +++ b/lgc/state/ResourceUsage.cpp @@ -43,9 +43,9 @@ ResourceUsage::ResourceUsage(ShaderStageEnum shaderStage) { builtInUsage.vs.baseVertex = true; builtInUsage.vs.baseInstance = true; } else if (shaderStage == ShaderStage::TessControl) { - inOutUsage.tcs.calcFactor = {}; + inOutUsage.tcs.hwConfig = {}; } else if (shaderStage == ShaderStage::Geometry) { - inOutUsage.gs.calcFactor = {}; + inOutUsage.gs.hwConfig = {}; } else if (shaderStage == ShaderStage::Fragment) { for (uint32_t i = 0; i < MaxColorTargets; ++i) { inOutUsage.fs.outputTypes[i] = BasicType::Unknown; diff --git a/lgc/test/BuiltIns/cs-numworkgroups.lgc b/lgc/test/BuiltIns/cs-numworkgroups.lgc index 553a5a8a6c..d7d16e8a66 100644 --- a/lgc/test/BuiltIns/cs-numworkgroups.lgc +++ b/lgc/test/BuiltIns/cs-numworkgroups.lgc @@ -53,7 +53,7 @@ attributes #0 = { nounwind } ; CHECK-NEXT: .cs: ; CHECK-NEXT: .checksum_value: 0 ; CHECK-NEXT: .debug_mode: false -; CHECK-NEXT: .entry_point: _amdgpu_cs_main +; CHECK-NEXT: .entry_point{{(_symbol)?}}: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 ; CHECK-NEXT: .ieee_mode: false diff --git a/lgc/test/BuiltIns/cs-workgroupid.lgc b/lgc/test/BuiltIns/cs-workgroupid.lgc index fc1154a056..b917de05a2 100644 --- a/lgc/test/BuiltIns/cs-workgroupid.lgc +++ b/lgc/test/BuiltIns/cs-workgroupid.lgc @@ -50,7 +50,7 @@ attributes #0 = { nounwind } ; CHECK-NEXT: .cs: ; CHECK-NEXT: .checksum_value: 0 ; CHECK-NEXT: .debug_mode: false -; CHECK-NEXT: .entry_point: _amdgpu_cs_main +; CHECK-NEXT: .entry_point{{(_symbol)?}}: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 ; CHECK-NEXT: .ieee_mode: false diff --git a/lgc/test/CsComputeLibrary.lgc b/lgc/test/CsComputeLibrary.lgc index 933d317855..e88ff1fefb 100644 --- 
a/lgc/test/CsComputeLibrary.lgc +++ b/lgc/test/CsComputeLibrary.lgc @@ -1,6 +1,6 @@ ; Define a compute library that can be called from a compute shader. -; RUN: lgc -mcpu=gfx1010 -print-after=lgc-mutate-entry-point -print-after=lgc-patch-prepare-pipeline-abi -print-after=lgc-patch-setup-target-features -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-mutate-entry-point -print-after=lgc-patch-prepare-pipeline-abi -print-after=lgc-set-up-target-features -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s ; CHECK: IR Dump After Mutate entry point ; CHECK: define amdgpu_gfx void @func(i32 inreg noundef %globalTable, ptr addrspace(4) inreg noundef %numWorkgroupsPtr, i32 inreg noundef %userdata0, i32 inreg noundef %userdata1, i32 inreg noundef %userdata2, i32 inreg noundef %userdata3, i32 inreg noundef %userdata4, i32 inreg noundef %userdata5, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %userdata9, i32 inreg noundef %userdata10, i32 inreg noundef %userdata11, i32 inreg noundef %spillTable, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !lgc.shaderstage !7 { ; CHECK: !7 = !{i32 7} diff --git a/lgc/test/CsLowerDebugPrintf.lgc b/lgc/test/CsLowerDebugPrintf.lgc index e186777b71..386f866284 100644 --- a/lgc/test/CsLowerDebugPrintf.lgc +++ b/lgc/test/CsLowerDebugPrintf.lgc @@ -48,7 +48,7 @@ declare <3 x i32> @lgc.reconfigure.local.invocation.id(<3 x i32>, i32) #1 declare ptr addrspace(4) @lgc.descriptor.table.addr(i32, i32, i32, i32, i32) #1 ; Function Attrs: nounwind willreturn memory(none) -declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #2 +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>, i1) #2 attributes #0 = { nounwind } attributes #1 = { nounwind memory(none) } @@ -66,7 +66,7 @@ attributes #2 = { nounwind willreturn memory(none) } !2 = !{i32 
-158142355, i32 1527082450, i32 -1021507957, i32 -258612566, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 2} !3 = !{i32 -297365566, i32 -915288882, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800} !4 = !{!"DescriptorTableVaPtr", i32 7, i32 0, i32 0, i32 1, i32 1} -!5 = !{!"DescriptorBuffer", i32 6, i32 0, i32 0, i32 4, i32 -1, i32 6, i32 4} +!5 = !{!"DescriptorBuffer", i32 6, i32 0, i32 0, i32 4, i32 -16, i32 6, i32 4} !6 = !{!"\82\B0amdpal.pipelines\91\84\AA.registers\80\B0.spill_threshold\CE\FF\FF\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CFIv3\E9\C4\9B\CEd\CF31\1AG\C6\99\D0\DE\AD.llpc_version\A461.0\AEamdpal.version\92\02\06"} !7 = !{i32 5} !8 = !{i32 7} @@ -80,7 +80,7 @@ attributes #2 = { nounwind willreturn memory(none) } ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(4) ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP5]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP5]], i32 -1) ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4) [[TMP6]], i1 false, i1 false) +; CHECK-NEXT: [[TMP7:%.*]] = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4) [[TMP6]], i1 false, i1 false, i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = call <3 x i32> @lgc.shader.input.WorkgroupId(i32 0) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[TMP9:%.*]] = mul <3 x i32> [[TMP8]], {{(splat \(i32 1\))|()}} ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @lgc.shader.input.LocalInvocationId(i32 49) #[[ATTR2]] diff --git a/lgc/test/ImageSampleNoReturn.lgc b/lgc/test/ImageSampleNoReturn.lgc index 0cbd540fc4..f7c0a38f8c 100644 --- a/lgc/test/ImageSampleNoReturn.lgc +++ b/lgc/test/ImageSampleNoReturn.lgc @@ -39,7 +39,7 @@ attributes #1 = { nounwind 
readnone } ; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(4) ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP4]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP4]], i32 -1) ] -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP4]], align 4, !invariant.load !3 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP4]], align 4, !invariant.load !4 ; CHECK-NEXT: call void @llvm.amdgcn.image.sample.2d.nortn.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH0000, half 0xH0000, <8 x i32> [[TMP5]], <4 x i32> , i1 false, i32 0, i32 0) ; CHECK-NEXT: ret void ; diff --git a/lgc/test/PeepholeOptPhiWithIdenticalLoad.lgc b/lgc/test/PeepholeOptPhiWithIdenticalLoad.lgc index d9a66b26e1..fe469fddae 100644 --- a/lgc/test/PeepholeOptPhiWithIdenticalLoad.lgc +++ b/lgc/test/PeepholeOptPhiWithIdenticalLoad.lgc @@ -3,6 +3,7 @@ ; RUN: lgc -mcpu=gfx1010 -print-after=lgc-peephole-optimization -o - - <%s 2>&1 | FileCheck --check-prefixes=CHECK %s ; CHECK: [[LOAD:%[0-9a-z]*]] = load i32, ptr addrspace(3) @lds, align 16 +; CHECK: [[GEP:%[0-9a-z]*]] = getelementptr <{ [0 x [3 x i32]] }>, ptr addrspace(1) ; CHECK: [[PHI:%[0-9a-z]*]] = phi i32 [ [[LOAD]], %.entry ], [ [[INC:%[0-9a-z]*]], %.block2 ] ; CHECK: .block2: ; CHECK-NEXT: [[INC]] = add i32 [[PHI]], 1 @@ -13,9 +14,18 @@ target triple = "amdgcn--amdpal" @lds = addrspace(3) global i32 undef, align 16 ; Function Attrs: nounwind -define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc.shaderstage !0 { +define dllexport spir_func void @lgc.shader.CS.main(i32 inreg noundef %userdata9, i32 inreg noundef %userdata10) local_unnamed_addr #0 !lgc.shaderstage !0 { .entry: %load0 = load i32, ptr addrspace(3) @lds, align 16 + %27 = insertelement <2 x i32> poison, i32 %userdata9, i64 0 + %28 = insertelement <2 x i32> %27, i32 %userdata10, i64 1 + %29 = bitcast <2 x i32> %28 to i64 + %30 
= add i64 %29, 2621456 + %31 = inttoptr i64 %30 to ptr addrspace(1) + %32 = insertelement <3 x i32> , i32 0, i64 0 + %33 = zext i32 %load0 to i64 + %34 = getelementptr <{ [0 x [3 x i32]] }>, ptr addrspace(1) %31, i64 0, i32 0, i64 %33 + store <3 x i32> %32, ptr addrspace(1) %34, align 4 br label %.block0 .block0: ; preds = %.entry @@ -40,4 +50,13 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc ret void } +attributes #3 = { nodivergencesource nounwind willreturn memory(read) } +!lgc.user.data.nodes = !{!4, !5, !6, !7, !8, !9} + !0 = !{i32 7} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 128, i32 15, i32 1, i32 3} +!5 = !{!"DescriptorBufferCompact", i32 10, i32 128, i32 0, i32 2, i64 93, i32 17, i32 2} +!6 = !{!"DescriptorBuffer", i32 6, i32 128, i32 2, i32 8, i64 93, i32 0, i32 4} +!7 = !{!"DescriptorBuffer", i32 6, i32 128, i32 10, i32 8, i64 93, i32 1, i32 4} +!8 = !{!"StreamOutTableVaPtr", i32 11, i32 2, i32 2, i32 1, i32 0} +!9 = !{!"PushConst", i32 9, i32 128, i32 3, i32 10, i64 4294967295, i32 0, i32 4} diff --git a/lgc/test/ShaderStages.lgc b/lgc/test/ShaderStages.lgc index ce9c5c6ab3..e5d8e8de4c 100644 --- a/lgc/test/ShaderStages.lgc +++ b/lgc/test/ShaderStages.lgc @@ -1,7 +1,7 @@ ; ---------------------------------------------------------------------- ; Extract 1: CS -; RUN: lgc -extract=1 -print-after=lgc-patch-setup-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK1,CHECK-NGG1 %s +; RUN: lgc -extract=1 -print-after=lgc-set-up-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK1,CHECK-NGG1 %s ; CHECK-NGG1: define dllexport amdgpu_cs void @_amdgpu_cs_main{{.*}} !lgc.shaderstage [[cs_stage:![0-9]*]] { ; CHECK1: [[cs_stage]] = !{i32 7} @@ -27,7 +27,7 @@ attributes #0 = { nounwind } ; ---------------------------------------------------------------------- ; Extract 2: VS/FS -; RUN: lgc -extract=2 -print-after=lgc-patch-setup-target-features -mcpu=gfx1010 %s 
-o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG2 %s +; RUN: lgc -extract=2 -print-after=lgc-set-up-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG2 %s ; CHECK-NGG2: define dllexport amdgpu_gs void @_amdgpu_gs_main{{.*}} !lgc.shaderstage [[vert_stage:![0-9]*]] { ; CHECK-NGG2: define dllexport amdgpu_ps void @_amdgpu_ps_main{{.*}} !lgc.shaderstage [[frag_stage:![0-9]*]] { @@ -63,7 +63,7 @@ attributes #0 = { nounwind } ; ---------------------------------------------------------------------- ; Extract 3: GS/VS -; RUN: lgc -extract=3 -print-after=lgc-patch-setup-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG3 %s +; RUN: lgc -extract=3 -print-after=lgc-set-up-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG3 %s ; _amdgpu_gs_main must be first, so it can be linked with a potential vertex fetch shader. ; CHECK-NGG3: define dllexport amdgpu_gs void @_amdgpu_gs_main{{.*}} !lgc.shaderstage [[geom_stage:![0-9]*]] { @@ -84,8 +84,7 @@ define dllexport spir_func void @lgc.shader.GS.main() local_unnamed_addr #0 !lgc call void @lgc.output.export.builtin.PointSize.i32.i32.f32(i32 1, i32 0, float undef) #0 call void @lgc.output.export.builtin.ClipDistance.i32.i32.a1f32(i32 3, i32 0, [1 x float] undef) #0 call void @lgc.output.export.builtin.CullDistance.i32.i32.a1f32(i32 4, i32 0, [1 x float] undef) #0 - %0 = call i32 @lgc.input.import.builtin.GsWaveId.i32.i32(i32 268435466) #0 - call void @llvm.amdgcn.s.sendmsg(i32 34, i32 %0) + call void @lgc.gs.emit.stream(i32 0) ret void } @@ -119,13 +118,11 @@ declare float @lgc.input.import.generic__f32.i32.i32.i32(i32, i32, i32) #1 ; Function Attrs: nounwind readonly declare <4 x double> @lgc.input.import.generic__v4f64.i32.i32.i32(i32, i32, i32) #1 -; Function Attrs: nounwind -declare i32 @lgc.input.import.builtin.GsWaveId.i32.i32(i32) #0 - -; Function Attrs: nounwind -declare void @llvm.amdgcn.s.sendmsg(i32 
immarg, i32) #0 +; Function Attrs: nounwind memory(write) +declare void @lgc.gs.emit.stream(i32) #2 attributes #0 = { nounwind } +;attributes #2 = { nounwind memory(write) } !llpc.geometry.mode = !{!0} !lgc.options = !{!1} @@ -149,7 +146,7 @@ attributes #0 = { nounwind } ; ---------------------------------------------------------------------- ; Extract 4: TCS/TES -; RUN: lgc -extract=4 -print-after=lgc-patch-setup-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG4 %s +; RUN: lgc -extract=4 -print-after=lgc-set-up-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG4 %s ; CHECK-NGG4: define dllexport amdgpu_gs void @_amdgpu_gs_main{{.*}} !lgc.shaderstage [[tc_stage:![0-9]*]] { ; CHECK-NGG4: define dllexport amdgpu_hs void @_amdgpu_hs_main{{.*}} !lgc.shaderstage [[te_stage:![0-9]*]] { @@ -220,7 +217,7 @@ attributes #1 = { nounwind readonly } ; ---------------------------------------------------------------------- ; Extract 5: TCS -; RUN: lgc -extract=5 -print-after=lgc-patch-setup-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG5 %s +; RUN: lgc -extract=5 -print-after=lgc-set-up-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG5 %s ; CHECK-NGG5: define dllexport amdgpu_hs void @_amdgpu_hs_main{{.*}} !lgc.shaderstage [[tc_stage:![0-9]*]] { ; CHECK-NGG5: [[tc_stage]] = !{i32 2} @@ -273,7 +270,7 @@ attributes #1 = { nounwind readonly } ; ---------------------------------------------------------------------- ; Extract 6: TES -; RUN: lgc -extract=6 -print-after=lgc-patch-setup-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG6 %s +; RUN: lgc -extract=6 -print-after=lgc-set-up-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG6 %s ; CHECK-NGG6: define dllexport amdgpu_gs void @_amdgpu_gs_main{{.*}} !lgc.shaderstage [[te_stage:![0-9]*]] { ; 
CHECK-NGG6: define dllexport amdgpu_ps void @_amdgpu_ps_main{{.*}} !lgc.shaderstage [[frag_stage:![0-9]*]] { @@ -327,7 +324,7 @@ attributes #1 = { nounwind readonly } ; ---------------------------------------------------------------------- ; Extract 7: TCS/TES/GS -; RUN: lgc -extract=7 -print-after=lgc-patch-setup-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG7 %s +; RUN: lgc -extract=7 -print-after=lgc-set-up-target-features -mcpu=gfx1010 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-NGG7 %s ; When there is are tes and geom shader, _amdgpu_hs_main must be first, so it can be linked with a potential ; vertex fetch shader. @@ -373,8 +370,7 @@ define dllexport spir_func void @lgc.shader.GS.main() local_unnamed_addr #0 !lgc call void @lgc.output.export.builtin.PointSize.i32.i32.f32(i32 1, i32 0, float undef) #0 call void @lgc.output.export.builtin.ClipDistance.i32.i32.a1f32(i32 3, i32 0, [1 x float] undef) #0 call void @lgc.output.export.builtin.CullDistance.i32.i32.a1f32(i32 4, i32 0, [1 x float] undef) #0 - %0 = call i32 @lgc.input.import.builtin.GsWaveId.i32.i32(i32 268435466) #0 - call void @llvm.amdgcn.s.sendmsg(i32 34, i32 %0) + call void @lgc.gs.emit.stream(i32 0) ret void } @@ -428,14 +424,12 @@ declare float @lgc.input.import.generic__f32.i32.i32.i32(i32, i32, i32) #1 ; Function Attrs: nounwind readonly declare <4 x double> @lgc.input.import.generic__v4f64.i32.i32.i32(i32, i32, i32) #1 -; Function Attrs: nounwind -declare i32 @lgc.input.import.builtin.GsWaveId.i32.i32(i32) #0 - -; Function Attrs: nounwind -declare void @llvm.amdgcn.s.sendmsg(i32 immarg, i32) #0 +; Function Attrs: nounwind memory(write) +declare void @lgc.gs.emit.stream(i32) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } +attributes #2 = { nounwind memory(write) } !llpc.tcs.mode = !{!0} !llpc.geometry.mode = !{!0} diff --git a/lgc/test/TaskShaderOps.lgc b/lgc/test/TaskShaderOps.lgc index b660de8feb..2cd877c367 100644 
--- a/lgc/test/TaskShaderOps.lgc +++ b/lgc/test/TaskShaderOps.lgc @@ -34,9 +34,9 @@ ; CHECK-NEXT: [[baseAddr:%[0-9]*]] = bitcast <2 x i32> [[baseAddrTmp1]] to i64 ; CHECK-NEXT: [[wrappedEntryIndex64:%[0-9]*]] = zext i32 [[entryOffset]] to i64 ; CHECK-NEXT: [[newBaseAddr:%[0-9]*]] = add nuw nsw i64 [[baseAddr]], [[wrappedEntryIndex64]] -; CHECK-NEXT: [[newDescWord0:%[0-9]*]] = trunc i64 [[newBaseAddr]] to i32 -; CHECK-NEXT: [[newBaseAddrHi64:%[a-z.]*]] = lshr i64 [[newBaseAddr]], 32 -; CHECK-NEXT: [[newBaseAddrHi32:%[0-9]*]] = trunc {{(nuw nsw )?}}i64 [[newBaseAddrHi64]] to i32 +; CHECK-NEXT: [[newDescWorkBoth:%[0-9]*]] = bitcast i64 [[newBaseAddr]] to <2 x i32> +; CHECK-NEXT: [[newDescWord0:%[0-9]*]] = extractelement <2 x i32> [[newDescWorkBoth]], i64 0 +; CHECK-NEXT: [[newBaseAddrHi32:%[0-9]*]] = extractelement <2 x i32> [[newDescWorkBoth]], i64 1 ; CHECK-NEXT: [[newBaseAddrHi:%[0-9]*]] = and i32 [[newBaseAddrHi32]], 65535 ; CHECK-NEXT: [[newDescWord1Tmp:%[0-9]*]] = and i32 [[descWord1]], -65536 ; CHECK-NEXT: [[newDescWord1:%[0-9]*]] = or {{(disjoint )?}}i32 [[newDescWord1Tmp]], [[newBaseAddrHi]] diff --git a/lgc/test/TaskShaderRegConfig.lgc b/lgc/test/TaskShaderRegConfig.lgc index fddcfe6e49..79071253f1 100644 --- a/lgc/test/TaskShaderRegConfig.lgc +++ b/lgc/test/TaskShaderRegConfig.lgc @@ -53,7 +53,7 @@ attributes #0 = { nounwind } ; CHECK-NEXT: .cs: ; CHECK-NEXT: .checksum_value: 0xbbc4ff6d ; CHECK-NEXT: .debug_mode: false -; CHECK-NEXT: .entry_point: _amdgpu_cs_main +; CHECK-NEXT: .entry_point{{(_symbol)?}}: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 ; CHECK-NEXT: .ieee_mode: false diff --git a/lgc/test/Transforms/Continufy/simple.lgc b/lgc/test/Transforms/Continufy/simple.lgc deleted file mode 100644 index 35d1dc7cf3..0000000000 --- a/lgc/test/Transforms/Continufy/simple.lgc +++ /dev/null @@ -1,83 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: 
lgc -mcpu=gfx1030 -o - -passes="require,lgc-continufy" %s | FileCheck --check-prefixes=CHECK %s - -define spir_func void @raygen() !lgc.shaderstage !{i32 7} !continufy.stage !{i32 0} { -; CHECK-LABEL: define {{[^@]+}}@raygen -; CHECK-SAME: ({} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[SHADER_INDEX:%.*]]) !lgc.shaderstage [[META2:![0-9]+]] !continufy.stage [[META3:![0-9]+]] !lgc.cps [[META4:![0-9]+]] { -; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 0) -; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 -; CHECK-NEXT: [[P8:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST]], i32 8 -; CHECK-NEXT: [[X:%.*]] = load i32, ptr addrspace(4) [[P8]], align 4 -; CHECK-NEXT: [[P16:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST]], i32 16 -; CHECK-NEXT: [[DST:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[P16]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FN]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = call [2 x i32] (...) 
@lgc.cps.await__a2i32(i32 [[TMP2]], i32 8, i32 poison, i32 [[X]], ptr addrspace(1) [[DST]]), !continuation.returnedRegistercount [[META3]] -; CHECK-NEXT: store [2 x i32] [[TMP3]], ptr addrspace(1) [[DST]], align 4 -; CHECK-NEXT: call void @lgc.cps.complete() -; CHECK-NEXT: unreachable -; - %pushconst = call ptr addrspace(4) @lgc.user.data(i32 0) - %fn = load ptr, ptr addrspace(4) %pushconst - %p8 = getelementptr i8, ptr addrspace(4) %pushconst, i32 8 - %x = load i32, ptr addrspace(4) %p8 - %p16 = getelementptr i8, ptr addrspace(4) %pushconst, i32 16 - %dst = load ptr addrspace(1), ptr addrspace(4) %p16 - %r = call spir_func [2 x i32] %fn(i32 %x, ptr addrspace(1) %dst), !continufy.stage !{i32 -1} - store [2 x i32] %r, ptr addrspace(1) %dst - ret void -} - -define spir_func i32 @chs(i32 %x) !lgc.shaderstage !{i32 7} !continufy.stage !{i32 3} { -; CHECK-LABEL: define {{[^@]+}}@chs -; CHECK-SAME: ({} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[SHADER_INDEX:%.*]], i32 [[X:%.*]]) !lgc.shaderstage [[META2]] !continufy.stage [[META5:![0-9]+]] !lgc.cps [[META6:![0-9]+]] { -; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 24) -; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FN]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 (...) @lgc.cps.await__i32(i32 [[TMP2]], i32 4, i32 poison, i32 [[X]]), !continuation.returnedRegistercount [[META3]] -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RCR]], i32 8, i32 poison, i32 poison, i32 poison, i32 [[TMP3]]) -; CHECK-NEXT: unreachable -; - %pushconst = call ptr addrspace(4) @lgc.user.data(i32 24) - %fn = load ptr, ptr addrspace(4) %pushconst - %y = call spir_func i32 %fn(i32 %x), !continufy.stage !{i32 5} - ret i32 %y -} - -; Note: No !continufy.stage metadata here -define dllexport void @lgc.shader.CS.main() !lgc.shaderstage !{i32 7} { -; CHECK-LABEL: define {{[^@]+}}@lgc.shader.CS.main -; CHECK-SAME: () !lgc.shaderstage [[META2]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ID:%.*]] = call i32 @lgc.shader.input.LocalInvocationId(i32 49) -; CHECK-NEXT: [[LIVE:%.*]] = icmp ult i32 [[ID]], 29 -; CHECK-NEXT: br i1 [[LIVE]], label [[MAIN:%.*]], label [[EXIT:%.*]] -; CHECK: main: -; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 32) -; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[FN]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[TMP0]], 1 -; CHECK-NEXT: call void (...) 
@lgc.cps.await__isVoid(i32 [[TMP1]], i32 2, i32 poison), !continuation.returnedRegistercount [[META3]] -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: call void @lgc.cps.complete() -; CHECK-NEXT: unreachable -; -entry: - %id = call i32 @lgc.shader.input.LocalInvocationId(i32 49) - %live = icmp ult i32 %id, 29 - br i1 %live, label %main, label %exit - -main: - %pushconst = call ptr addrspace(4) @lgc.user.data(i32 32) - %fn = load ptr, ptr addrspace(4) %pushconst - call spir_func void %fn(), !continufy.stage !{i32 0} - br label %exit - -exit: - ret void -} - -declare ptr addrspace(4) @lgc.user.data(i32) -declare i32 @lgc.shader.input.LocalInvocationId(i32) diff --git a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc index 8c56b0d423..172d8d82f2 100644 --- a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc +++ b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc @@ -5,7 +5,7 @@ declare void @lgc.cps.jump(...) 
#0 define void @test(i32 %cspInit, i32 %arg, ptr %table, i32 %rcr) !lgc.cps !1 !lgc.shaderstage !2 !continuation !3 !continuation.state !4 { ; CHECK-LABEL: define {{[^@]+}}@test -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]], i32 [[RCR:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps [[META3:![0-9]+]] !lgc.shaderstage [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.state [[META6:![0-9]+]] { +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]], i32 [[RCR:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps [[META4:![0-9]+]] !lgc.shaderstage [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] !continuation.state [[META7:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() @@ -22,66 +22,67 @@ define void @test(i32 %cspInit, i32 %arg, ptr %table, i32 %rcr) !lgc.cps !1 !lgc ; CHECK-NEXT: [[TMP7:%.*]] = 
load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP8]], i32 [[CR_THEN]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP9]], i32 [[TMP7]], 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP10]], i32 [[RCR]], 3 -; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP11]], i32 [[THEN_ARG]], 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP12]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP13]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 7 -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP15]], 3 -; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP18]]) -; CHECK-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP15]], 2 -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP22]]) -; CHECK-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 [[TMP21]] -; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP15]], 1 -; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP26]]) -; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 -; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP27]], i32 [[TMP25]] -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP29]], i1 true) -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP14]], i32 
[[TMP30]]) -; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP14]], [[TMP31]] -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP32]]) -; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP31]]) -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP33]]) -; CHECK-NEXT: [[TMP36:%.*]] = and i32 [[TMP34]], -64 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP36]], i64 0 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <2 x i32> [[TMP37]] to i64 -; CHECK-NEXT: [[TMP39:%.*]] = inttoptr i64 [[TMP38]] to ptr -; CHECK-NEXT: [[TMP40:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP41:%.*]] = bitcast i64 [[TMP40]] to <2 x i32> -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP41]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP41]], i64 1 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <20 x i32> [[TMP47]], i32 [[TMP42]], i64 1 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <20 x i32> [[TMP48]], i32 [[TMP43]], i64 2 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> [[TMP49]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: 
[[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[TMP44]], i64 16 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[TMP45]], i64 17 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[TMP46]], i64 18 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32s(ptr inreg [[TMP39]], i32 inreg [[TMP35]], <20 x i32> inreg [[TMP66]], { <3 x i32>, i32, i32, i32, i32 } [[TMP12]], i32 0) +; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP8]], i32 [[CR_THEN]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP9]], i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP10]], i32 poison, 3 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP11]], i32 [[RCR]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP12]], i32 [[THEN_ARG]], 5 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP13]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = call i32 
@llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP14]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i32 [[TMP16]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP17]]) +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP16]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP19]]) +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP16]], 2 +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP23]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 [[TMP22]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP16]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP27]]) +; CHECK-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP30]], i1 true) +; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP15]], i32 [[TMP31]]) +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[TMP15]], [[TMP32]] +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP33]]) +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP32]]) +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP34]]) +; CHECK-NEXT: [[TMP37:%.*]] = and i32 [[TMP35]], -64 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP37]], i64 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <2 x i32> [[TMP38]] to i64 +; CHECK-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr +; CHECK-NEXT: [[TMP41:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast i64 [[TMP41]] to <2 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = extractelement 
<2 x i32> [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP42]], i64 1 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <20 x i32> [[TMP48]], i32 [[TMP43]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> [[TMP49]], i32 [[TMP44]], i64 2 +; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[TMP45]], i64 16 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[TMP46]], i64 17 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 
[[TMP47]], i64 18 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32s(ptr inreg [[TMP40]], i32 inreg [[TMP36]], <20 x i32> inreg [[TMP67]], { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP13]], i32 0) ; CHECK-NEXT: unreachable ; entry: @@ -91,7 +92,7 @@ entry: %cr.then = load i32, ptr %table.0, align 4 %then.arg = add i32 %arg, 1 %0 = load i32, ptr %csp, align 4 - call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, i32 %0, i32 %rcr, i32 %then.arg) + call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, i32 %0, i32 poison, i32 %rcr, i32 %then.arg) unreachable } diff --git a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc index 81b3d94c7c..3e64a3ac94 100644 --- a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc @@ -24,7 +24,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[USERDATA1]], i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[USERDATA2]], i64 2 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[USERDATA3]], i64 3 -; CHECK-NEXT: [[PTR:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP10]]) +; CHECK-NEXT: [[PTR:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP10]], i1 false) ; CHECK-NEXT: [[P0:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 0 ; CHECK-NEXT: [[I_VSP:%.*]] = load i32, ptr addrspace(7) [[P0]], align 4 ; CHECK-NEXT: store i32 [[I_VSP]], ptr [[CSP]], align 4 @@ -84,7 +84,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc .entry: %csp = alloca i32, align 4 %desc = call <4 x i32> @lgc.load.user.data__v4i32(i32 
0) - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %p0 = getelementptr i32, ptr addrspace(7) %ptr, i32 0 %i_vsp = load i32, ptr addrspace(7) %p0, align 4 store i32 %i_vsp, ptr %csp, align 4 @@ -99,7 +99,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc declare <4 x i32> @lgc.load.user.data__v4i32(i32) #3 -declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #4 +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>, i1) #4 attributes #0 = { nounwind } attributes #1 = { nounwind willreturn memory(inaccessiblemem: write) } diff --git a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc deleted file mode 100644 index 22085927e9..0000000000 --- a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc +++ /dev/null @@ -1,420 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 -; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-mutate-entry-point" %s | FileCheck --check-prefixes=CHECK %s - -define spir_func void @_rgen_1(i32 %cspInit, i32 %rcr) #0 !spirv.ExecutionModel !16 !lgc.shaderstage !17 !continuation !18 !lgc.cps !19 !continuation.state !20 { -; CHECK-LABEL: define amdgpu_cs_chain void @_rgen_1( -; CHECK-SAME: i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[USERDATA0:%.*]], i32 inreg [[USERDATA1:%.*]], i32 inreg [[USERDATA2:%.*]], i32 inreg [[USERDATA3:%.*]], i32 inreg [[USERDATA4:%.*]], i32 inreg [[USERDATA5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]], i32 
[[RCR:%.*]]) #[[ATTR0:[0-9]+]] align 64 !spirv.ExecutionModel [[META16:![0-9]+]] !lgc.shaderstage [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !lgc.cps [[META19:![0-9]+]] !continuation.state [[META20:![0-9]+]] { -; CHECK-NEXT: .entry: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[SPILLTABLE]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(4) -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP5]] to <2 x i32> -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 96 -; CHECK-NEXT: store i32 [[TMP8]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP13]] to <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[USERDATA5]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP15]] to i64 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr addrspace(4) -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = load <2 x i32>, ptr addrspace(4) [[TMP18]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP19]], i64 1 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> poison, i32 
[[TMP20]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP21]], 65535 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP23]], i64 1 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 -1, i64 2 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 553734060, i64 3 -; CHECK-NEXT: [[TMP27:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP26]]) -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[USERDATA0]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <2 x i32> [[TMP28]] to i64 -; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr addrspace(4) -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP30]], i32 32 -; CHECK-NEXT: [[TMP32:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP31]], align 16 -; CHECK-NEXT: [[TMP33:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP32]]) -; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(5) -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP34]], i32 0 -; CHECK-NEXT: store ptr addrspace(7) [[TMP33]], ptr addrspace(5) [[TMP35]], align 32 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[USERDATA0]], i64 0 -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <2 x i32> [[TMP36]] to i64 -; CHECK-NEXT: [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr addrspace(4) -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP38]], i32 48 -; CHECK-NEXT: [[TMP40:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP39]], align 16 -; CHECK-NEXT: [[TMP41:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP40]]) -; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP7]], 8 -; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(5) -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP43]], i32 0 -; CHECK-NEXT: store ptr addrspace(7) [[TMP41]], ptr addrspace(5) [[TMP44]], align 32 -; CHECK-NEXT: [[TMP45:%.*]] = load volatile 
i32, ptr addrspace(7) [[TMP41]], align 4 -; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP7]], 16 -; CHECK-NEXT: [[TMP47:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(5) -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP47]], i32 0 -; CHECK-NEXT: store i32 [[TMP45]], ptr addrspace(5) [[TMP48]], align 4 -; CHECK-NEXT: [[TMP49:%.*]] = add i32 [[TMP45]], -37 -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP27]], i32 52 -; CHECK-NEXT: [[TMP51:%.*]] = load i64, ptr addrspace(7) [[TMP50]], align 8 -; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP27]], i32 60 -; CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(7) [[TMP52]], align 4 -; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP49]], [[TMP53]] -; CHECK-NEXT: [[TMP55:%.*]] = inttoptr i64 [[TMP51]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP56:%.*]] = sext i32 [[TMP54]] to i64 -; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP55]], i64 [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = load i64, ptr addrspace(1) [[TMP57]], align 8 -; CHECK-NEXT: [[TMP59:%.*]] = inttoptr i64 [[TMP58]] to ptr -; CHECK-NEXT: [[TMP60:%.*]] = ptrtoint ptr [[TMP59]] to i32 -; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP60]], 1 -; CHECK-NEXT: [[TMP62:%.*]] = inttoptr i32 [[TMP61]] to ptr -; CHECK-NEXT: [[TMP63:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] -; CHECK: tail.block: -; CHECK-NEXT: [[TMP64:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 -; CHECK-NEXT: [[TMP65:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP64]], i32 [[TMP61]], 1 -; CHECK-NEXT: [[TMP66:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP65]], i32 [[TMP63]], 2 -; CHECK-NEXT: [[TMP67:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP66]], i32 add (i32 ptrtoint (ptr @_rgen_1.resume.0 to i32), i32 1), 3 -; CHECK-NEXT: [[TMP68:%.*]] = insertvalue { <3 x i32>, i32, 
i32, i32, i32, i32 } [[TMP67]], i32 undef, 4 -; CHECK-NEXT: [[TMP69:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP68]], i32 [[TMP49]], 5 -; CHECK-NEXT: [[TMP71:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP69]], 1 -; CHECK-NEXT: [[TMP72:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP71]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP73:%.*]] = and i32 [[TMP72]], 7 -; CHECK-NEXT: [[TMP74:%.*]] = icmp ne i32 [[TMP73]], 0 -; CHECK-NEXT: [[TMP75:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP74]]) -; CHECK-NEXT: [[TMP76:%.*]] = icmp eq i32 [[TMP73]], 3 -; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP76]]) -; CHECK-NEXT: [[TMP78:%.*]] = icmp ne i32 [[TMP77]], 0 -; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP78]], i32 [[TMP77]], i32 [[TMP75]] -; CHECK-NEXT: [[TMP80:%.*]] = icmp eq i32 [[TMP73]], 2 -; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP80]]) -; CHECK-NEXT: [[TMP82:%.*]] = icmp ne i32 [[TMP81]], 0 -; CHECK-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP81]], i32 [[TMP79]] -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i32 [[TMP73]], 1 -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP84]]) -; CHECK-NEXT: [[TMP86:%.*]] = icmp ne i32 [[TMP85]], 0 -; CHECK-NEXT: [[TMP87:%.*]] = select i1 [[TMP86]], i32 [[TMP85]], i32 [[TMP83]] -; CHECK-NEXT: [[TMP88:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP87]], i1 true) -; CHECK-NEXT: [[TMP89:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP72]], i32 [[TMP88]]) -; CHECK-NEXT: [[TMP90:%.*]] = icmp eq i32 [[TMP72]], [[TMP89]] -; CHECK-NEXT: [[TMP91:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP90]]) -; CHECK-NEXT: [[TMP92:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP89]]) -; CHECK-NEXT: [[TMP93:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP91]]) -; CHECK-NEXT: [[TMP94:%.*]] = and i32 [[TMP92]], -64 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP94]], i64 0 -; CHECK-NEXT: [[TMP96:%.*]] = 
bitcast <2 x i32> [[TMP95]] to i64 -; CHECK-NEXT: [[TMP97:%.*]] = inttoptr i64 [[TMP96]] to ptr -; CHECK-NEXT: [[TMP98:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP99:%.*]] = bitcast i64 [[TMP98]] to <2 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <2 x i32> [[TMP99]], i64 0 -; CHECK-NEXT: [[TMP101:%.*]] = extractelement <2 x i32> [[TMP99]], i64 1 -; CHECK-NEXT: [[TMP102:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP105:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP106:%.*]] = insertelement <20 x i32> [[TMP105]], i32 [[TMP100]], i64 1 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <20 x i32> [[TMP106]], i32 [[TMP101]], i64 2 -; CHECK-NEXT: [[TMP108:%.*]] = insertelement <20 x i32> [[TMP107]], i32 [[USERDATA0]], i64 3 -; CHECK-NEXT: [[TMP109:%.*]] = insertelement <20 x i32> [[TMP108]], i32 [[USERDATA1]], i64 4 -; CHECK-NEXT: [[TMP110:%.*]] = insertelement <20 x i32> [[TMP109]], i32 [[USERDATA2]], i64 5 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <20 x i32> [[TMP110]], i32 [[USERDATA3]], i64 6 -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <20 x i32> [[TMP111]], i32 [[USERDATA4]], i64 7 -; CHECK-NEXT: [[TMP113:%.*]] = insertelement <20 x i32> [[TMP112]], i32 [[USERDATA5]], i64 8 -; CHECK-NEXT: [[TMP114:%.*]] = insertelement <20 x i32> [[TMP113]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP115:%.*]] = insertelement <20 x i32> [[TMP114]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP116:%.*]] = insertelement <20 x i32> [[TMP115]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP117:%.*]] = insertelement <20 x i32> [[TMP116]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP118:%.*]] = insertelement <20 x i32> [[TMP117]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP119:%.*]] = insertelement <20 x i32> [[TMP118]], i32 
[[PAD11]], i64 14 -; CHECK-NEXT: [[TMP120:%.*]] = insertelement <20 x i32> [[TMP119]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP121:%.*]] = insertelement <20 x i32> [[TMP120]], i32 [[TMP102]], i64 16 -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <20 x i32> [[TMP121]], i32 [[TMP103]], i64 17 -; CHECK-NEXT: [[TMP123:%.*]] = insertelement <20 x i32> [[TMP122]], i32 [[TMP104]], i64 18 -; CHECK-NEXT: [[TMP124:%.*]] = insertelement <20 x i32> [[TMP123]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32s(ptr inreg [[TMP97]], i32 inreg [[TMP93]], <20 x i32> inreg [[TMP124]], { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP69]], i32 0) -; CHECK-NEXT: unreachable -; -.entry: - %csp = alloca i32, align 4 - store i32 %cspInit, ptr %csp, align 4 - %0 = load i32, ptr %csp, align 4 - %1 = add i32 %0, 96 - store i32 %1, ptr %csp, align 4 - %2 = call i64 @llvm.amdgcn.s.getpc() - %3 = bitcast i64 %2 to <2 x i32> - %4 = call i64 @llvm.amdgcn.s.getpc() - %5 = bitcast i64 %4 to <2 x i32> - %6 = call i64 @llvm.amdgcn.s.getpc() - %7 = bitcast i64 %6 to <2 x i32> - %8 = call i32 @lgc.load.user.data__i32(i32 20) - %9 = insertelement <2 x i32> %7, i32 %8, i64 0 - %10 = bitcast <2 x i32> %9 to i64 - %11 = inttoptr i64 %10 to ptr addrspace(4) - %12 = getelementptr i8, ptr addrspace(4) %11, i32 0 - %13 = load <2 x i32>, ptr addrspace(4) %12, align 8 - %14 = extractelement <2 x i32> %13, i64 0 - %15 = extractelement <2 x i32> %13, i64 1 - %16 = insertelement <4 x i32> poison, i32 %14, i64 0 - %17 = and i32 %15, 65535 - %18 = insertelement <4 x i32> %16, i32 %17, i64 1 - %19 = insertelement <4 x i32> %18, i32 -1, i64 2 - %20 = insertelement <4 x i32> %19, i32 553734060, i64 3 - %21 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %20) - %22 = call i32 @lgc.load.user.data__i32(i32 0) - %23 = insertelement <2 x i32> %5, i32 %22, i64 0 - %24 = 
bitcast <2 x i32> %23 to i64 - %25 = inttoptr i64 %24 to ptr addrspace(4) - %26 = getelementptr i8, ptr addrspace(4) %25, i32 32 - %27 = load <4 x i32>, ptr addrspace(4) %26, align 16 - %28 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %27) - %29 = inttoptr i32 %0 to ptr addrspace(5) - %30 = getelementptr i8, ptr addrspace(5) %29, i32 0 - store ptr addrspace(7) %28, ptr addrspace(5) %30, align 32 - %31 = call i32 @lgc.load.user.data__i32(i32 0) - %32 = insertelement <2 x i32> %3, i32 %31, i64 0 - %33 = bitcast <2 x i32> %32 to i64 - %34 = inttoptr i64 %33 to ptr addrspace(4) - %35 = getelementptr i8, ptr addrspace(4) %34, i32 48 - %36 = load <4 x i32>, ptr addrspace(4) %35, align 16 - %37 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %36) - %38 = add i32 %0, 8 - %39 = inttoptr i32 %38 to ptr addrspace(5) - %40 = getelementptr i8, ptr addrspace(5) %39, i32 0 - store ptr addrspace(7) %37, ptr addrspace(5) %40, align 32 - %41 = load volatile i32, ptr addrspace(7) %37, align 4 - %42 = add i32 %0, 16 - %43 = inttoptr i32 %42 to ptr addrspace(5) - %44 = getelementptr i8, ptr addrspace(5) %43, i32 0 - store i32 %41, ptr addrspace(5) %44, align 4 - %45 = add i32 %41, -37 - %46 = getelementptr inbounds i8, ptr addrspace(7) %21, i32 52 - %47 = load i64, ptr addrspace(7) %46, align 8 - %48 = getelementptr inbounds i8, ptr addrspace(7) %21, i32 60 - %49 = load i32, ptr addrspace(7) %48, align 4 - %50 = mul i32 %45, %49 - %51 = inttoptr i64 %47 to ptr addrspace(1) - %52 = sext i32 %50 to i64 - %53 = getelementptr i8, ptr addrspace(1) %51, i64 %52 - %54 = load i64, ptr addrspace(1) %53, align 8 - %55 = inttoptr i64 %54 to ptr - %56 = ptrtoint ptr %55 to i32 - %57 = or i32 %56, 1 - %58 = inttoptr i32 %57 to ptr - %59 = call i32 (...) @lgc.cps.as.continuation.reference(ptr @_rgen_1.resume.0) - %60 = load i32, ptr %csp, align 4 - call void (...) 
@lgc.cps.jump(i32 %57, i32 2, i32 %60, i32 %59, [1 x i32] undef, i32 %45) - unreachable -} - -define void @_rgen_1.resume.0(i32 %cspInit, i32 %0, [1 x i32] %1) !spirv.ExecutionModel !16 !lgc.shaderstage !17 !continuation !21 !lgc.cps !19 !continuation.state !20 { -; CHECK-LABEL: define amdgpu_cs_chain void @_rgen_1.resume.0( -; CHECK-SAME: i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[USERDATA0:%.*]], i32 inreg [[USERDATA1:%.*]], i32 inreg [[USERDATA2:%.*]], i32 inreg [[USERDATA3:%.*]], i32 inreg [[USERDATA4:%.*]], i32 inreg [[USERDATA5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], [1 x i32] [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] align 64 !spirv.ExecutionModel [[META16]] !lgc.shaderstage [[META17]] !continuation [[META21:![0-9]+]] !lgc.cps [[META19]] !continuation.state [[META20]] { -; CHECK-NEXT: entryresume.0: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[SPILLTABLE]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4) -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP7]] to <2 x i32> -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], -96 -; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], 16 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP11]] to ptr addrspace(5) -; 
CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP12]], i32 0 -; CHECK-NEXT: [[DOTRELOAD6:%.*]] = load i32, ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP10]], 8 -; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(5) -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP15]], i32 0 -; CHECK-NEXT: [[DOTRELOAD3:%.*]] = load ptr addrspace(7), ptr addrspace(5) [[TMP16]], align 32 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i32 [[TMP10]] to ptr addrspace(5) -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP17]], i32 0 -; CHECK-NEXT: [[DOTRELOAD:%.*]] = load ptr addrspace(7), ptr addrspace(5) [[TMP18]], align 32 -; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[USERDATA5]], 24 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP10]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP20]] to ptr addrspace(5) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP21]], i32 0 -; CHECK-NEXT: [[DUMMY_RELOAD:%.*]] = load ptr addrspace(7), ptr addrspace(5) [[TMP22]], align 32 -; CHECK-NEXT: [[TMP23:%.*]] = load volatile i32, ptr addrspace(7) [[DOTRELOAD3]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[DOTRELOAD6]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = zext i1 [[TMP24]] to i32 -; CHECK-NEXT: store i32 [[TMP25]], ptr addrspace(7) [[DOTRELOAD]], align 4 -; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] -; CHECK: tail.block: -; CHECK-NEXT: [[TMP26:%.*]] = insertvalue { <3 x i32>, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <3 x i32>, i32, i32 } [[TMP26]], i32 0, 1 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <3 x i32>, i32, i32 } [[TMP27]], i32 poison, 2 -; CHECK-NEXT: [[TMP29:%.*]] = extractvalue { <3 x i32>, i32, i32 } [[TMP28]], 1 -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP29]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 7 -; CHECK-NEXT: 
[[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP32]]) -; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP31]], 3 -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP34]]) -; CHECK-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0 -; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 [[TMP35]], i32 [[TMP33]] -; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i32 [[TMP31]], 2 -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) -; CHECK-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP39]], i32 [[TMP37]] -; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP31]], 1 -; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP42]]) -; CHECK-NEXT: [[TMP44:%.*]] = icmp ne i32 [[TMP43]], 0 -; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP43]], i32 [[TMP41]] -; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP45]], i1 true) -; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP30]], i32 [[TMP46]]) -; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i32 [[TMP30]], [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP48]]) -; CHECK-NEXT: [[TMP50:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP47]]) -; CHECK-NEXT: [[TMP51:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP49]]) -; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i32 [[TMP50]], 0 -; CHECK-NEXT: br i1 [[TMP52]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] -; CHECK: chain.block: -; CHECK-NEXT: [[TMP53:%.*]] = and i32 [[TMP50]], -64 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP53]], i64 0 -; CHECK-NEXT: [[TMP55:%.*]] = bitcast <2 x i32> [[TMP54]] to i64 -; CHECK-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr -; CHECK-NEXT: [[TMP57:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP58:%.*]] = bitcast i64 [[TMP57]] to <2 x i32> -; CHECK-NEXT: 
[[TMP59:%.*]] = extractelement <2 x i32> [[TMP58]], i64 0 -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP58]], i64 1 -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[TMP59]], i64 1 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[TMP60]], i64 2 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[USERDATA0]], i64 3 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[USERDATA1]], i64 4 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[USERDATA2]], i64 5 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[USERDATA3]], i64 6 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[USERDATA4]], i64 7 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[USERDATA5]], i64 8 -; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP78:%.*]] = insertelement <20 x i32> [[TMP77]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <20 x i32> [[TMP78]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP80:%.*]] = insertelement <20 x i32> [[TMP79]], i32 [[TMP61]], i64 16 -; CHECK-NEXT: [[TMP81:%.*]] = insertelement <20 x i32> [[TMP80]], i32 [[TMP62]], i64 17 -; 
CHECK-NEXT: [[TMP82:%.*]] = insertelement <20 x i32> [[TMP81]], i32 [[TMP63]], i64 18 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <20 x i32> [[TMP82]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32s(ptr inreg [[TMP56]], i32 inreg [[TMP51]], <20 x i32> inreg [[TMP83]], { <3 x i32>, i32, i32 } [[TMP28]], i32 0) -; CHECK-NEXT: unreachable -; CHECK: ret.block: -; CHECK-NEXT: ret void -; -entryresume.0: - %csp = alloca i32, align 4 - store i32 %cspInit, ptr %csp, align 4 - %2 = load i32, ptr %csp, align 4 - %3 = add i32 %2, -96 - %4 = add i32 %3, 16 - %5 = inttoptr i32 %4 to ptr addrspace(5) - %6 = getelementptr i8, ptr addrspace(5) %5, i32 0 - %.reload6 = load i32, ptr addrspace(5) %6, align 4 - %7 = add i32 %3, 8 - %8 = inttoptr i32 %7 to ptr addrspace(5) - %9 = getelementptr i8, ptr addrspace(5) %8, i32 0 - %.reload3 = load ptr addrspace(7), ptr addrspace(5) %9, align 32 - %10 = inttoptr i32 %3 to ptr addrspace(5) - %11 = getelementptr i8, ptr addrspace(5) %10, i32 0 - %.reload = load ptr addrspace(7), ptr addrspace(5) %11, align 32 - %dummy.udata = call i32 @lgc.load.user.data__i32(i32 20) - %12 = mul i32 %dummy.udata, 24 - %13 = add i32 %3, %12 - %14 = inttoptr i32 %13 to ptr addrspace(5) - %15 = getelementptr i8, ptr addrspace(5) %14, i32 0 - %dummy.reload = load ptr addrspace(7), ptr addrspace(5) %15, align 32 - %16 = load volatile i32, ptr addrspace(7) %.reload3, align 4 - %17 = icmp eq i32 %.reload6, %16 - %18 = zext i1 %17 to i32 - store i32 %18, ptr addrspace(7) %.reload, align 4 - ret void -} - -declare i32 @lgc.load.user.data__i32(i32) #1 - -declare noundef i64 @llvm.amdgcn.s.getpc() #2 - -declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1 - -declare ptr addrspace(32) @lgc.cps.alloc(i32) #3 - -declare i32 @lgc.cps.as.continuation.reference(...) #4 - -declare void @lgc.cps.jump(...) 
#5 - -declare ptr addrspace(32) @lgc.cps.peek(i32) #6 - -declare void @lgc.cps.complete() - -declare !continuation !18 { ptr, ptr } @continuation.prototype._rgen_1(ptr, i1) - -declare ptr @continuation.malloc(i32) - -declare void @continuation.free(ptr) - -declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) #7 - -declare ptr @llvm.coro.begin(token, ptr writeonly) #7 - -declare !continuation !21 { ptr, ptr } @continuation.prototype._rgen_1.resume.0(ptr, i1) - -attributes #0 = { alwaysinline nounwind "target-features"=",+wavefrontsize32" } -attributes #1 = { nounwind willreturn memory(none) } -attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -attributes #3 = { nounwind willreturn memory(inaccessiblemem: readwrite) } -attributes #4 = { nounwind willreturn } -attributes #5 = { noreturn } -attributes #6 = { nounwind willreturn memory(inaccessiblemem: read) } -attributes #7 = { nounwind } - -!llpc.compute.mode = !{!0} -!lgc.client = !{!1} -!lgc.options = !{!2} -!lgc.options.CS = !{!3} -!lgc.user.data.nodes = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13} -!amdgpu.pal.metadata.msgpack = !{!14} -!continuation.stackAddrspace = !{!15} - -!0 = !{i32 8, i32 4, i32 1} -!1 = !{!"Vulkan"} -!2 = !{i32 262875531, i32 502344192, i32 854861601, i32 -1595331954, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216, i32 0, i32 0, i32 2} -!3 = !{i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 192, i32 0, i32 0, i32 32, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} -!4 = !{!"DescriptorTableVaPtr", i32 7, i32 0, i32 0, i32 1, i32 4} -!5 = !{!"DescriptorBuffer", i32 6, i32 0, i32 0, i32 4, i64 0, i32 0, i32 4} -!6 = !{!"DescriptorBuffer", i32 6, i32 0, i32 4, i32 4, i64 0, i32 1, i32 4} -!7 = !{!"DescriptorBuffer", i32 6, i32 0, 
i32 8, i32 4, i64 0, i32 2, i32 4} -!8 = !{!"DescriptorBuffer", i32 6, i32 0, i32 12, i32 4, i64 0, i32 3, i32 4} -!9 = !{!"StreamOutTableVaPtr", i32 11, i32 0, i32 1, i32 1, i32 0} -!10 = !{!"DescriptorTableVaPtr", i32 7, i32 0, i32 5, i32 1, i32 3} -!11 = !{!"DescriptorBufferCompact", i32 10, i32 0, i32 0, i32 2, i64 93, i32 17, i32 2} -!12 = !{!"DescriptorBuffer", i32 6, i32 0, i32 2, i32 4, i64 93, i32 0, i32 4} -!13 = !{!"DescriptorBuffer", i32 6, i32 0, i32 6, i32 4, i64 93, i32 1, i32 4} -!14 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\C4jyX\05\E6M\0F\CF\03b\DD\05\C5\B6\DB\B9\AD.llpc_version\A467.0\AEamdpal.version\92\03\00"} -!15 = !{i32 5} -!16 = !{i32 5313} -!17 = !{i32 7} -!18 = !{ptr @_rgen_1} -!19 = !{i32 1} -!20 = !{i32 0} -!21 = !{ptr @_rgen_1.resume.0} -;. -; CHECK: [[META16]] = !{i32 5313} -; CHECK: [[META17]] = !{i32 7} -; CHECK: [[META18]] = !{ptr @_rgen_1} -; CHECK: [[META19]] = !{i32 1} -; CHECK: [[META20]] = !{i32 0} -; CHECK: [[META21]] = !{ptr @_rgen_1.resume.0} -;. 
diff --git a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc index ac26052e02..1f64393281 100644 --- a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc @@ -19,7 +19,7 @@ declare void @lgc.cps.complete() define void @test.0(i32 %cspInit) !lgc.cps !1 !lgc.shaderstage !2 !continuation !3 !continuation.state !4 { ; CHECK-LABEL: define {{[^@]+}}@test.0 -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps [[META3:![0-9]+]] !lgc.shaderstage [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.state [[META6:![0-9]+]] { +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps [[META4:![0-9]+]] !lgc.shaderstage [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] !continuation.state [[META7:![0-9]+]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca 
i32, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() @@ -47,67 +47,68 @@ define void @test.0(i32 %cspInit) !lgc.cps !1 !lgc.shaderstage !2 !continuation ; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP18]], i32 add (i32 ptrtoint (ptr @test.1 to i32), i32 1), 1 -; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP19]], i32 [[TMP17]], 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP20]], i32 poison, 3 -; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP21]], i32 [[TMP14]], 4 -; CHECK-NEXT: [[TMP23:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP22]], i32 [[TMP11]], 5 -; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP23]], 1 -; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP24]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], 7 -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP27]]) -; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP26]], 3 -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP29]]) -; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP28]] -; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[TMP26]], 2 -; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP33]]) -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 [[TMP32]] -; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP26]], 1 -; 
CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP37]]) -; CHECK-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 -; CHECK-NEXT: [[TMP40:%.*]] = select i1 [[TMP39]], i32 [[TMP38]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP40]], i1 true) -; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 [[TMP41]]) -; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[TMP25]], [[TMP42]] -; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP43]]) -; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP42]]) -; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP44]]) -; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP45]], -64 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP47]], i64 0 -; CHECK-NEXT: [[TMP49:%.*]] = bitcast <2 x i32> [[TMP48]] to i64 -; CHECK-NEXT: [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr -; CHECK-NEXT: [[TMP51:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP52:%.*]] = bitcast i64 [[TMP51]] to <2 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <2 x i32> [[TMP52]], i64 0 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <2 x i32> [[TMP52]], i64 1 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[TMP53]], i64 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[TMP54]], i64 2 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 
[[PAD2]], i64 5 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[TMP55]], i64 16 -; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[TMP56]], i64 17 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[TMP57]], i64 18 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32s(ptr inreg [[TMP50]], i32 inreg [[TMP46]], <20 x i32> inreg [[TMP77]], { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP23]], i32 0) +; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP18]], i32 add (i32 ptrtoint (ptr @test.1 to i32), i32 1), 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP19]], i32 [[TMP17]], 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP20]], i32 poison, 3 +; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP21]], i32 poison, 4 +; CHECK-NEXT: [[TMP23:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP22]], i32 [[TMP14]], 5 +; CHECK-NEXT: [[TMP24:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP23]], i32 [[TMP11]], 6 +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP24]], 1 +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP25]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 7 +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP28]]) +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP27]], 3 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 [[TMP29]] +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP27]], 2 +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP34]]) +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0 +; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 [[TMP35]], i32 [[TMP33]] +; CHECK-NEXT: 
[[TMP38:%.*]] = icmp eq i32 [[TMP27]], 1 +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP39]], i32 [[TMP37]] +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP41]], i1 true) +; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 [[TMP42]]) +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP26]], [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP44]]) +; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP43]]) +; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP45]]) +; CHECK-NEXT: [[TMP48:%.*]] = and i32 [[TMP46]], -64 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP48]], i64 0 +; CHECK-NEXT: [[TMP50:%.*]] = bitcast <2 x i32> [[TMP49]] to i64 +; CHECK-NEXT: [[TMP51:%.*]] = inttoptr i64 [[TMP50]] to ptr +; CHECK-NEXT: [[TMP52:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP53:%.*]] = bitcast i64 [[TMP52]] to <2 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <2 x i32> [[TMP53]], i64 0 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i32> [[TMP53]], i64 1 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[TMP54]], i64 1 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[TMP55]], i64 2 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP64:%.*]] = 
insertelement <20 x i32> [[TMP63]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[TMP56]], i64 16 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[TMP57]], i64 17 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[TMP58]], i64 18 +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <20 x i32> [[TMP77]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32i32s(ptr inreg [[TMP51]], i32 inreg [[TMP47]], <20 x i32> inreg [[TMP78]], { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP24]], i32 0) ; CHECK-NEXT: unreachable ; AllocaSpillBB: @@ -129,13 +130,13 @@ AllocaSpillBB: store i8 99, ptr addrspace(5) %9, align 1 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) %10 = load i32, ptr %csp, align 4 - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 %10, i32 poison, i32 %7, i32 %4) + call void (...) 
@lgc.cps.jump(i32 %cr, i32 2, i32 %10, i32 poison, i32 poison, i32 %7, i32 %4) unreachable } define void @test.1(i32 %cspInit, i32 %p2, i32 %q1) !lgc.cps !1 !lgc.shaderstage !2 !continuation !5 !continuation.state !4 { ; CHECK-LABEL: define {{[^@]+}}@test.1 -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]], i32 [[P2:%.*]], i32 [[Q1:%.*]]) #[[ATTR1]] align 64 !lgc.cps [[META3]] !lgc.shaderstage [[META4]] !continuation [[META7:![0-9]+]] !continuation.state [[META6]] { +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]], i32 [[P2:%.*]], i32 [[Q1:%.*]]) #[[ATTR1]] align 64 !lgc.cps [[META4]] !lgc.shaderstage [[META5]] !continuation [[META8:![0-9]+]] !continuation.state [[META7]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() @@ -155,65 +156,66 @@ define void @test.1(i32 %cspInit, i32 %p2, i32 %q1) !lgc.cps !1 !lgc.shaderstage ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr 
[[CSP]], align 4 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <3 x i32>, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { <3 x i32>, i32, i32, i32 } [[TMP12]], i32 add (i32 ptrtoint (ptr @test.2 to i32), i32 1), 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { <3 x i32>, i32, i32, i32 } [[TMP13]], i32 [[TMP11]], 2 -; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <3 x i32>, i32, i32, i32 } [[TMP14]], i32 poison, 3 -; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <3 x i32>, i32, i32, i32 } [[TMP15]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP16]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 -; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP19]]) -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP18]], 3 -; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP21]]) -; CHECK-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP18]], 2 -; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP25]]) -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 [[TMP24]] -; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP18]], 1 -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP29]]) -; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP28]] -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP32]], i1 true) -; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP17]], i32 [[TMP33]]) -; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP17]], [[TMP34]] -; CHECK-NEXT: [[TMP36:%.*]] = call 
i32 @llvm.amdgcn.ballot.i32(i1 [[TMP35]]) -; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP34]]) -; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP36]]) -; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP37]], -64 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP39]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <2 x i32> [[TMP40]] to i64 -; CHECK-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr -; CHECK-NEXT: [[TMP43:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP44:%.*]] = bitcast i64 [[TMP43]] to <2 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP44]], i64 0 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i64 1 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[TMP45]], i64 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[TMP46]], i64 2 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP61:%.*]] = 
insertelement <20 x i32> [[TMP60]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[TMP47]], i64 16 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[TMP48]], i64 17 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[TMP49]], i64 18 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32s(ptr inreg [[TMP42]], i32 inreg [[TMP38]], <20 x i32> inreg [[TMP69]], { <3 x i32>, i32, i32, i32 } [[TMP15]], i32 0) +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP12]], i32 add (i32 ptrtoint (ptr @test.2 to i32), i32 1), 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP13]], i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP14]], i32 poison, 3 +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP15]], i32 poison, 4 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP16]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP17]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 7 +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 
[[TMP20]]) +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP19]], 3 +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP22]]) +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP19]], 2 +; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP26]]) +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP27]], i32 [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP19]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 [[TMP29]] +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP33]], i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP18]], i32 [[TMP34]]) +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP18]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP36]]) +; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP35]]) +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP37]]) +; CHECK-NEXT: [[TMP40:%.*]] = and i32 [[TMP38]], -64 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <2 x i32> [[TMP41]] to i64 +; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to ptr +; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP45:%.*]] = bitcast i64 [[TMP44]] to <2 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP45]], i64 0 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP45]], i64 1 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i32> 
[[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[TMP46]], i64 1 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[TMP47]], i64 2 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[TMP48]], i64 16 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[TMP49]], i64 17 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP50]], i64 18 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32s(ptr inreg [[TMP43]], i32 inreg [[TMP39]], <20 x i32> inreg [[TMP70]], { <3 x i32>, i32, i32, i32, i32 } [[TMP16]], i32 0) ; CHECK-NEXT: unreachable ; AllocaSpillBB: @@ -227,13 +229,13 @@ AllocaSpillBB: %n99 = load i8, ptr addrspace(5) %3, align 1 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) %4 = load i32, ptr %csp, align 4 - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 %4, i32 poison) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 %4, i32 poison, i32 poison) unreachable } define void @test.2(i32 %cspInit) !lgc.cps !1 !lgc.shaderstage !2 !continuation !6 !continuation.state !4 { ; CHECK-LABEL: define {{[^@]+}}@test.2 -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]]) #[[ATTR1]] align 64 !lgc.cps [[META3]] !lgc.shaderstage [[META4]] !continuation [[META8:![0-9]+]] !continuation.state [[META6]] { +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]]) #[[ATTR1]] align 64 !lgc.cps 
[[META4]] !lgc.shaderstage [[META5]] !continuation [[META9:![0-9]+]] !continuation.state [[META7]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() @@ -254,67 +256,68 @@ define void @test.2(i32 %cspInit) !lgc.cps !1 !lgc.shaderstage !2 !continuation ; CHECK-NEXT: store i32 [[TMP12]], ptr [[CSP]], align 4 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { <3 x i32>, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { <3 x i32>, i32, i32 } [[TMP13]], i32 0, 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <3 x i32>, i32, i32 } [[TMP14]], i32 poison, 2 -; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <3 x i32>, i32, i32 } [[TMP15]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP16]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 -; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP19]]) -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP18]], 3 -; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP21]]) -; CHECK-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP18]], 2 -; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP25]]) -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 [[TMP24]] -; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP18]], 1 -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP29]]) -; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP28]] -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.cttz.i32(i32 
[[TMP32]], i1 true) -; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP17]], i32 [[TMP33]]) -; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP17]], [[TMP34]] -; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP35]]) -; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP34]]) -; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP36]]) -; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[TMP37]], 0 -; CHECK-NEXT: br i1 [[TMP39]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { <3 x i32>, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { <3 x i32>, i32, i32, i32 } [[TMP13]], i32 0, 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <3 x i32>, i32, i32, i32 } [[TMP14]], i32 poison, 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <3 x i32>, i32, i32, i32 } [[TMP15]], i32 poison, 3 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <3 x i32>, i32, i32, i32 } [[TMP16]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP17]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 7 +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP20]]) +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP19]], 3 +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP22]]) +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP19]], 2 +; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP26]]) +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP27]], i32 [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP19]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 
[[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 [[TMP29]] +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP33]], i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP18]], i32 [[TMP34]]) +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP18]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP36]]) +; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP35]]) +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP37]]) +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[TMP38]], 0 +; CHECK-NEXT: br i1 [[TMP40]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP40:%.*]] = and i32 [[TMP37]], -64 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP40]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = bitcast <2 x i32> [[TMP41]] to i64 -; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to ptr -; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP45:%.*]] = bitcast i64 [[TMP44]] to <2 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP45]], i64 0 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP45]], i64 1 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[TMP46]], i64 1 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[TMP47]], i64 2 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], 
i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[TMP48]], i64 16 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[TMP49]], i64 17 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP50]], i64 18 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32s(ptr inreg [[TMP43]], i32 inreg [[TMP38]], <20 x i32> inreg [[TMP70]], { <3 x i32>, i32, i32 } [[TMP15]], i32 0) +; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[TMP38]], -64 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP41]], i64 0 +; CHECK-NEXT: [[TMP43:%.*]] = bitcast <2 x i32> [[TMP42]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr +; CHECK-NEXT: [[TMP45:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP46:%.*]] = bitcast i64 [[TMP45]] to <2 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP46]], i64 0 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP46]], i64 1 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[TMP47]], i64 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[TMP48]], i64 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 
[[PAD8]], i64 11 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[TMP49]], i64 16 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP50]], i64 17 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[TMP51]], i64 18 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32s(ptr inreg [[TMP44]], i32 inreg [[TMP39]], <20 x i32> inreg [[TMP71]], { <3 x i32>, i32, i32, i32 } [[TMP16]], i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void @@ -335,7 +338,7 @@ AllocaSpillBB: define void @test.gep(i32 %cspInit) !lgc.cps !1 !lgc.shaderstage !2 !continuation !7 !continuation.state !4 { ; CHECK-LABEL: define {{[^@]+}}@test.gep -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]]) #[[ATTR1]] align 64 !lgc.cps [[META3]] !lgc.shaderstage [[META4]] !continuation [[META9:![0-9]+]] !continuation.state [[META6]] { +; CHECK-SAME: (i32 inreg 
[[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]]) #[[ATTR1]] align 64 !lgc.cps [[META4]] !lgc.shaderstage [[META5]] !continuation [[META10:![0-9]+]] !continuation.state [[META7]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() @@ -377,67 +380,68 @@ define void @test.gep(i32 %cspInit) !lgc.cps !1 !lgc.shaderstage !2 !continuatio ; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP28]], i32 add (i32 ptrtoint (ptr @test.1 to i32), i32 1), 1 -; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP29]], i32 [[TMP27]], 2 -; CHECK-NEXT: [[TMP31:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP30]], i32 poison, 3 -; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP31]], i32 [[TMP24]], 4 -; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP32]], i32 [[TMP24]], 5 -; CHECK-NEXT: [[TMP34:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP33]], 1 -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP34]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP36:%.*]] = and i32 [[TMP35]], 7 
-; CHECK-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 -; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP37]]) -; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[TMP36]], 3 -; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP39]]) -; CHECK-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP40]], i32 [[TMP38]] -; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[TMP36]], 2 -; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP43]]) -; CHECK-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0 -; CHECK-NEXT: [[TMP46:%.*]] = select i1 [[TMP45]], i32 [[TMP44]], i32 [[TMP42]] -; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP36]], 1 -; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP47]]) -; CHECK-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP48]], i32 [[TMP46]] -; CHECK-NEXT: [[TMP51:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP50]], i1 true) -; CHECK-NEXT: [[TMP52:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP35]], i32 [[TMP51]]) -; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i32 [[TMP35]], [[TMP52]] -; CHECK-NEXT: [[TMP54:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP53]]) -; CHECK-NEXT: [[TMP55:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP52]]) -; CHECK-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP54]]) -; CHECK-NEXT: [[TMP57:%.*]] = and i32 [[TMP55]], -64 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP57]], i64 0 -; CHECK-NEXT: [[TMP59:%.*]] = bitcast <2 x i32> [[TMP58]] to i64 -; CHECK-NEXT: [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr -; CHECK-NEXT: [[TMP61:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP62:%.*]] = bitcast i64 [[TMP61]] to <2 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP62]], i64 0 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[TMP62]], i64 1 -; CHECK-NEXT: 
[[TMP65:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP63]], i64 1 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[TMP64]], i64 2 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP78:%.*]] = insertelement <20 x i32> [[TMP77]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <20 x i32> [[TMP78]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP80:%.*]] = insertelement <20 x i32> [[TMP79]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP81:%.*]] = insertelement <20 x i32> [[TMP80]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP82:%.*]] = insertelement <20 x i32> [[TMP81]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <20 x i32> [[TMP82]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <20 x i32> [[TMP83]], i32 [[TMP65]], i64 16 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <20 x i32> [[TMP84]], i32 [[TMP66]], i64 17 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <20 x i32> [[TMP85]], i32 [[TMP67]], i64 18 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <20 x i32> [[TMP86]], i32 [[MULTIDISPATCHINFO]], i64 
19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32s(ptr inreg [[TMP60]], i32 inreg [[TMP56]], <20 x i32> inreg [[TMP87]], { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP33]], i32 0) +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP28]], i32 add (i32 ptrtoint (ptr @test.1 to i32), i32 1), 1 +; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP29]], i32 [[TMP27]], 2 +; CHECK-NEXT: [[TMP31:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP30]], i32 poison, 3 +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP31]], i32 poison, 4 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP32]], i32 [[TMP24]], 5 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP33]], i32 [[TMP24]], 6 +; CHECK-NEXT: [[TMP35:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP34]], 1 +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP35]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP37:%.*]] = and i32 [[TMP36]], 7 +; CHECK-NEXT: [[TMP38:%.*]] = icmp ne i32 [[TMP37]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[TMP37]], 3 +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 +; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP41]], i32 [[TMP39]] +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP37]], 2 +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP44]]) +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i32 [[TMP45]], 0 +; 
CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP45]], i32 [[TMP43]] +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i32 [[TMP37]], 1 +; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP48]]) +; CHECK-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[TMP49]], i32 [[TMP47]] +; CHECK-NEXT: [[TMP52:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP51]], i1 true) +; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP36]], i32 [[TMP52]]) +; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i32 [[TMP36]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP54]]) +; CHECK-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP53]]) +; CHECK-NEXT: [[TMP57:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP55]]) +; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP56]], -64 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP58]], i64 0 +; CHECK-NEXT: [[TMP60:%.*]] = bitcast <2 x i32> [[TMP59]] to i64 +; CHECK-NEXT: [[TMP61:%.*]] = inttoptr i64 [[TMP60]] to ptr +; CHECK-NEXT: [[TMP62:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP63:%.*]] = bitcast i64 [[TMP62]] to <2 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[TMP63]], i64 0 +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <2 x i32> [[TMP63]], i64 1 +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[TMP64]], i64 1 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[TMP65]], i64 2 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: 
[[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <20 x i32> [[TMP77]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <20 x i32> [[TMP78]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <20 x i32> [[TMP79]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP81:%.*]] = insertelement <20 x i32> [[TMP80]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <20 x i32> [[TMP81]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP83:%.*]] = insertelement <20 x i32> [[TMP82]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <20 x i32> [[TMP83]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <20 x i32> [[TMP84]], i32 [[TMP66]], i64 16 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <20 x i32> [[TMP85]], i32 [[TMP67]], i64 17 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <20 x i32> [[TMP86]], i32 [[TMP68]], i64 18 +; CHECK-NEXT: [[TMP88:%.*]] = insertelement <20 x i32> [[TMP87]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32i32s(ptr inreg [[TMP61]], i32 inreg [[TMP57]], <20 x i32> inreg [[TMP88]], { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP34]], i32 0) ; CHECK-NEXT: unreachable ; AllocaSpillBB: @@ -473,13 +477,13 @@ AllocaSpillBB: store i32 %17, ptr addrspace(5) %19, align 4 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) %20 = load i32, ptr %csp, align 4 - call void (...) 
@lgc.cps.jump(i32 %cr, i32 2, i32 %20, i32 poison, i32 %17, i32 %17) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 %20, i32 poison, i32 poison, i32 %17, i32 %17) unreachable } define void @test.nested.gep(i32 %cspInit) !lgc.cps !1 !lgc.shaderstage !2 !continuation !8 !continuation.state !4 { ; CHECK-LABEL: define {{[^@]+}}@test.nested.gep -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]]) #[[ATTR1]] align 64 !lgc.cps [[META3]] !lgc.shaderstage [[META4]] !continuation [[META10:![0-9]+]] !continuation.state [[META6]] { +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[CSPINIT:%.*]]) #[[ATTR1]] align 64 !lgc.cps [[META4]] !lgc.shaderstage [[META5]] !continuation [[META11:![0-9]+]] !continuation.state [[META7]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() @@ -504,67 +508,68 @@ define void @test.nested.gep(i32 %cspInit) !lgc.cps !1 !lgc.shaderstage !2 !cont ; CHECK-NEXT: 
[[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP16]], i32 add (i32 ptrtoint (ptr @test.1 to i32), i32 1), 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP17]], i32 [[TMP15]], 2 -; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP18]], i32 poison, 3 -; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP19]], i32 [[TMP12]], 4 -; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP20]], i32 [[TMP12]], 5 -; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP21]], 1 -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP22]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 7 -; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0 -; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP25]]) -; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP24]], 3 -; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP27]]) -; CHECK-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 [[TMP26]] -; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP24]], 2 -; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP31]]) -; CHECK-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 -; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP32]], i32 [[TMP30]] -; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP24]], 1 -; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP35]]) -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 -; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 
[[TMP36]], i32 [[TMP34]] -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP38]], i1 true) -; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 [[TMP39]]) -; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i32 [[TMP23]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP41]]) -; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP40]]) -; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP42]]) -; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP43]], -64 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP45]], i64 0 -; CHECK-NEXT: [[TMP47:%.*]] = bitcast <2 x i32> [[TMP46]] to i64 -; CHECK-NEXT: [[TMP48:%.*]] = inttoptr i64 [[TMP47]] to ptr -; CHECK-NEXT: [[TMP49:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP50:%.*]] = bitcast i64 [[TMP49]] to <2 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x i32> [[TMP50]], i64 0 -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <2 x i32> [[TMP50]], i64 1 -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[TMP51]], i64 1 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[TMP52]], i64 2 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD4]], i64 7 -; 
CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[TMP53]], i64 16 -; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[TMP54]], i64 17 -; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[TMP55]], i64 18 -; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32s(ptr inreg [[TMP48]], i32 inreg [[TMP44]], <20 x i32> inreg [[TMP75]], { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP21]], i32 0) +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP16]], i32 add (i32 ptrtoint (ptr @test.1 to i32), i32 1), 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP17]], i32 [[TMP15]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP18]], i32 poison, 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP19]], i32 poison, 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP20]], i32 [[TMP12]], 5 +; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP21]], i32 [[TMP12]], 6 +; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP22]], 1 +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP23]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 7 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP26]]) +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP25]], 3 +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP28]]) +; CHECK-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP29]], i32 [[TMP27]] +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP25]], 2 +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP32]]) +; CHECK-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 +; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP33]], i32 [[TMP31]] +; CHECK-NEXT: 
[[TMP36:%.*]] = icmp eq i32 [[TMP25]], 1 +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP36]]) +; CHECK-NEXT: [[TMP38:%.*]] = icmp ne i32 [[TMP37]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP37]], i32 [[TMP35]] +; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP39]], i1 true) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP24]], i32 [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP24]], [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP42]]) +; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP41]]) +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP43]]) +; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP44]], -64 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP46]], i64 0 +; CHECK-NEXT: [[TMP48:%.*]] = bitcast <2 x i32> [[TMP47]] to i64 +; CHECK-NEXT: [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr +; CHECK-NEXT: [[TMP50:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP51:%.*]] = bitcast i64 [[TMP50]] to <2 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <2 x i32> [[TMP51]], i64 0 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <2 x i32> [[TMP51]], i64 1 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[TMP52]], i64 1 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[TMP53]], i64 2 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP62:%.*]] = 
insertelement <20 x i32> [[TMP61]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[TMP54]], i64 16 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[TMP55]], i64 17 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[TMP56]], i64 18 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32i32s(ptr inreg [[TMP49]], i32 inreg [[TMP45]], <20 x i32> inreg [[TMP76]], { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP22]], i32 0) ; CHECK-NEXT: unreachable ; AllocaSpillBB: @@ -583,7 +588,7 @@ AllocaSpillBB: store i32 %5, ptr addrspace(5) %7, align 4 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) %8 = load i32, ptr %csp, align 4 - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 %8, i32 poison, i32 %5, i32 %5) + call void (...) 
@lgc.cps.jump(i32 %cr, i32 2, i32 %8, i32 poison, i32 poison, i32 %5, i32 %5) unreachable } diff --git a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc index 6b9eba1f1c..443a69958f 100644 --- a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc @@ -5,7 +5,7 @@ declare void @lgc.cps.jump(...) #0 define void @unify_jumps(i32 %arg, ptr %table) !lgc.cps !1 !lgc.shaderstage !2 { ; CHECK-LABEL: define {{[^@]+}}@unify_jumps -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps [[META3:![0-9]+]] !lgc.shaderstage [[META4:![0-9]+]] { +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps [[META4:![0-9]+]] !lgc.shaderstage [[META5:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast 
i64 [[TMP0]] to <2 x i32> @@ -31,69 +31,71 @@ define void @unify_jumps(i32 %arg, ptr %table) !lgc.cps !1 !lgc.shaderstage !2 { ; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[CR_ELSE]], [[ELSE]] ], [ [[CR_THEN]], [[THEN]] ] ; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ poison, [[ELSE]] ], [ poison, [[THEN]] ] ; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ELSE]] ], [ poison, [[THEN]] ] -; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ [[TMP7]], [[ELSE]] ], [ [[THEN_ARG]], [[THEN]] ] -; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ 5, [[ELSE]] ], [ poison, [[THEN]] ] -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP13]], i32 [[TMP8]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP14]], i32 [[TMP9]], 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP15]], i32 [[TMP10]], 3 -; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP16]], i32 [[TMP11]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP17]], i32 [[TMP12]], 5 -; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP18]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP19]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 7 -; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP22]]) -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP21]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ poison, [[ELSE]] ], [ poison, [[THEN]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP7]], [[ELSE]] ], [ [[THEN_ARG]], [[THEN]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ 5, [[ELSE]] ], [ poison, [[THEN]] ] +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } 
poison, <3 x i32> [[LOCALINVOCATIONID]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP14]], i32 [[TMP8]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP15]], i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP16]], i32 [[TMP10]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP17]], i32 [[TMP11]], 4 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP18]], i32 [[TMP12]], 5 +; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP19]], i32 [[TMP13]], 6 +; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP20]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP21]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 7 +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 ; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP24]]) -; CHECK-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP23]] -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP21]], 2 -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP28]]) -; CHECK-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP29]], i32 [[TMP27]] -; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP21]], 1 -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP32]]) -; CHECK-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 -; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP33]], i32 [[TMP31]] -; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP35]], i1 true) -; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP20]], i32 [[TMP36]]) -; CHECK-NEXT: 
[[TMP38:%.*]] = icmp eq i32 [[TMP20]], [[TMP37]] -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) -; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP37]]) -; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP39]]) -; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP40]], -64 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP42]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = bitcast <2 x i32> [[TMP43]] to i64 -; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr -; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP47:%.*]] = bitcast i64 [[TMP46]] to <2 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i64 0 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i64 1 -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[TMP48]], i64 1 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[TMP49]], i64 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP63:%.*]] = 
insertelement <20 x i32> [[TMP62]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP50]], i64 16 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[TMP51]], i64 17 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[TMP52]], i64 18 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32s(ptr inreg [[TMP45]], i32 inreg [[TMP41]], <20 x i32> inreg [[TMP72]], { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP18]], i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP23]], 3 +; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP26]]) +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP27]], i32 [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP23]], 2 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 [[TMP29]] +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP23]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP34]]) +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0 +; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 [[TMP35]], i32 [[TMP33]] +; CHECK-NEXT: 
[[TMP38:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP37]], i1 true) +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[TMP22]], [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP39]]) +; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP41]]) +; CHECK-NEXT: [[TMP44:%.*]] = and i32 [[TMP42]], -64 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP44]], i64 0 +; CHECK-NEXT: [[TMP46:%.*]] = bitcast <2 x i32> [[TMP45]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to ptr +; CHECK-NEXT: [[TMP48:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP49:%.*]] = bitcast i64 [[TMP48]] to <2 x i32> +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x i32> [[TMP49]], i64 0 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x i32> [[TMP49]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[TMP50]], i64 1 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[TMP51]], i64 2 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP63:%.*]] = 
insertelement <20 x i32> [[TMP62]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[TMP52]], i64 16 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[TMP53]], i64 17 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[TMP54]], i64 18 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32i32s(ptr inreg [[TMP47]], i32 inreg [[TMP43]], <20 x i32> inreg [[TMP74]], { <3 x i32>, i32, i32, i32, i32, i32, i32 } [[TMP20]], i32 0) ; CHECK-NEXT: unreachable ; entry: @@ -104,20 +106,20 @@ then: ; preds = %entry %table.0 = getelementptr i32, ptr %table, i32 0 %cr.then = load i32, ptr %table.0, align 4 %then.arg = add i32 %arg, 1 - call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, i32 poison, i32 poison, i32 %then.arg) + call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, i32 poison, i32 poison, i32 poison, i32 %then.arg) unreachable else: ; preds = %entry %table.1 = getelementptr i32, ptr %table, i32 1 %cr.else = load i32, ptr %table.1, align 4 %else.arg = uitofp i32 %arg to float - call void (...) 
@lgc.cps.jump(i32 %cr.else, i32 2, i32 poison, i32 poison, float %else.arg, i32 5) + call void (...) @lgc.cps.jump(i32 %cr.else, i32 2, i32 poison, i32 poison, i32 poison, float %else.arg, i32 5) unreachable } define void @unify_jump_ret(i32 %arg, ptr %table) !lgc.cps !1 !lgc.shaderstage !2 { ; CHECK-LABEL: define {{[^@]+}}@unify_jump_ret -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) #[[ATTR1]] align 64 !lgc.cps [[META3]] !lgc.shaderstage [[META4]] { +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) #[[ATTR1]] align 64 !lgc.cps [[META4]] !lgc.shaderstage [[META5]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -139,70 +141,72 @@ define void @unify_jump_ret(i32 %arg, ptr %table) !lgc.cps !1 !lgc.shaderstage ! 
; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ [[CR_THEN]], [[THEN]] ], [ 0, [[ELSE]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ poison, [[THEN]] ], [ poison, [[ELSE]] ] ; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ poison, [[THEN]] ], [ poison, [[ELSE]] ] -; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[THEN_ARG]], [[THEN]] ], [ poison, [[ELSE]] ] -; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP11]], i32 [[TMP7]], 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP12]], i32 [[TMP8]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP13]], i32 [[TMP9]], 3 -; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP14]], i32 [[TMP10]], 4 -; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32 } [[TMP15]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP16]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 -; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP19]]) -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP18]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ [[THEN_ARG]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP12]], i32 [[TMP7]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP13]], i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP14]], i32 [[TMP9]], 3 +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, 
i32, i32 } [[TMP15]], i32 [[TMP10]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP16]], i32 [[TMP11]], 5 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP17]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP18]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 7 +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 ; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP21]]) -; CHECK-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP18]], 2 -; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP25]]) -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 [[TMP24]] -; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP18]], 1 -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP29]]) -; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP28]] -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP32]], i1 true) -; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP17]], i32 [[TMP33]]) -; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP17]], [[TMP34]] -; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP35]]) -; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP34]]) -; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP36]]) -; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[TMP37]], 0 -; CHECK-NEXT: br i1 [[TMP39]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP20]], 3 +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP23]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne 
i32 [[TMP24]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 [[TMP22]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP20]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP27]]) +; CHECK-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP20]], 1 +; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP31]]) +; CHECK-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP32]], i32 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP34]], i1 true) +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP19]], i32 [[TMP35]]) +; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP19]], [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP37]]) +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP36]]) +; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP38]]) +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i32 [[TMP39]], 0 +; CHECK-NEXT: br i1 [[TMP41]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP40:%.*]] = and i32 [[TMP37]], -64 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP40]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = bitcast <2 x i32> [[TMP41]] to i64 -; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to ptr -; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP45:%.*]] = bitcast i64 [[TMP44]] to <2 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP45]], i64 0 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP45]], i64 1 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; 
CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[TMP46]], i64 1 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[TMP47]], i64 2 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[TMP48]], i64 16 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[TMP49]], i64 17 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP50]], i64 18 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32s(ptr inreg [[TMP43]], i32 inreg [[TMP38]], <20 x i32> inreg [[TMP70]], { <3 x i32>, i32, i32, i32, i32 } [[TMP15]], i32 0) +; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP39]], -64 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast <2 x i32> [[TMP43]] to i64 +; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr +; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast i64 [[TMP46]] to <2 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[TMP48]], i64 1 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[TMP49]], i64 2 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x 
i32> [[TMP63]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP50]], i64 16 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[TMP51]], i64 17 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[TMP52]], i64 18 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, i32, i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32i32i32i32i32s(ptr inreg [[TMP45]], i32 inreg [[TMP40]], <20 x i32> inreg [[TMP72]], { <3 x i32>, i32, i32, i32, i32, i32 } [[TMP17]], i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void @@ -215,7 +219,7 @@ then: ; preds = %entry %table.0 = getelementptr i32, ptr %table, i32 0 %cr.then = load i32, ptr %table.0, align 4 %then.arg = add i32 %arg, 1 - call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, i32 poison, i32 poison, i32 %then.arg) + call void (...) 
@lgc.cps.jump(i32 %cr.then, i32 2, i32 poison, i32 poison, i32 poison, i32 %then.arg) unreachable else: ; preds = %entry diff --git a/lgc/test/Transforms/PatchBufferOp/InvariantStartUserWithPhiNode.lgc b/lgc/test/Transforms/LowerBufferOperations/InvariantStartUserWithPhiNode.lgc similarity index 91% rename from lgc/test/Transforms/PatchBufferOp/InvariantStartUserWithPhiNode.lgc rename to lgc/test/Transforms/LowerBufferOperations/InvariantStartUserWithPhiNode.lgc index 908bb40f9e..3ea4645cdb 100644 --- a/lgc/test/Transforms/PatchBufferOp/InvariantStartUserWithPhiNode.lgc +++ b/lgc/test/Transforms/LowerBufferOperations/InvariantStartUserWithPhiNode.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc -; RUN: lgc -o - -passes="require,function(lgc-patch-buffer-op)" %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -o - -passes="require,function(lgc-lower-buffer-operations)" %s | FileCheck --check-prefixes=CHECK %s define dllexport spir_func void @lgc.shader.CS.main(<4 x i32> inreg %desc0,<4 x i32> inreg %desc1,<4 x i32> inreg %desc2, i32 %number) local_unnamed_addr #0 { ; CHECK-LABEL: @lgc.shader.CS.main( @@ -19,9 +19,9 @@ define dllexport spir_func void @lgc.shader.CS.main(<4 x i32> inreg %desc0,<4 x ; CHECK-NEXT: ret void ; .entry: - %0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) - %1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) - %2 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc2) + %0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0, i1 false) + %1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1, i1 false) + %2 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc2, i1 false) %3 = icmp eq i32 %number, 2 br i1 %3, label %branch_true, label %branch1 @@ -44,7 +44,7 @@ branch_merge: } ; Function Attrs: nounwind memory(none) -declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) nounwind readnone 
+declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>, i1) nounwind readnone declare ptr @llvm.invariant.start.p7(i64 immarg, ptr addrspace(7) nocapture) #2 attributes #0 = { nounwind } diff --git a/lgc/test/Transforms/PatchBufferOp/buffer-index-op.lgc b/lgc/test/Transforms/LowerBufferOperations/buffer-index-op.lgc similarity index 100% rename from lgc/test/Transforms/PatchBufferOp/buffer-index-op.lgc rename to lgc/test/Transforms/LowerBufferOperations/buffer-index-op.lgc diff --git a/lgc/test/Transforms/PatchBufferOp/buffer.atomic.ops.lgc b/lgc/test/Transforms/LowerBufferOperations/buffer.atomic.ops.lgc similarity index 97% rename from lgc/test/Transforms/PatchBufferOp/buffer.atomic.ops.lgc rename to lgc/test/Transforms/LowerBufferOperations/buffer.atomic.ops.lgc index a1928468ae..8225ca73a2 100644 --- a/lgc/test/Transforms/PatchBufferOp/buffer.atomic.ops.lgc +++ b/lgc/test/Transforms/LowerBufferOperations/buffer.atomic.ops.lgc @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc -; RUN: lgc -o - -passes='require,function(lgc-patch-buffer-op)' %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -o - -passes='require,function(lgc-lower-buffer-operations)' %s | FileCheck --check-prefixes=CHECK %s define amdgpu_gfx void @raw_atomic_load(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; CHECK-LABEL: @raw_atomic_load( ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 5) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %load = load atomic i32, ptr addrspace(7) %ptr monotonic, align 8 ret void } @@ -16,7 +16,7 @@ define amdgpu_gfx void @raw_atomicrmw_xchg(<4 x i32> inreg %desc) !lgc.shadersta ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.swap.i64{{(\.v4i32)?}}(i64 1, <4 x 
i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %xchg = atomicrmw xchg ptr addrspace(7) %ptr, i64 1 monotonic, align 8 ret void } @@ -26,7 +26,7 @@ define amdgpu_gfx void @raw_atomicrmw_add(<4 x i32> inreg %desc) !lgc.shaderstag ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %add = atomicrmw add ptr addrspace(7) %ptr, i64 1 monotonic, align 8 ret void } @@ -36,7 +36,7 @@ define amdgpu_gfx void @raw_atomicrmw_sub(<4 x i32> inreg %desc) !lgc.shaderstag ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.sub.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %sub = atomicrmw sub ptr addrspace(7) %ptr, i64 1 monotonic, align 8 ret void } @@ -46,7 +46,7 @@ define amdgpu_gfx void @raw_atomicrmw_and(<4 x i32> inreg %desc) !lgc.shaderstag ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.and.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %and = atomicrmw and ptr addrspace(7) %ptr, i64 1 monotonic, align 8 ret void } @@ -56,7 +56,7 @@ define amdgpu_gfx void @raw_atomicrmw_or(<4 x i32> inreg %desc) !lgc.shaderstage ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.or.i64{{(\.v4i32)?}}(i64 1, 
<4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %or = atomicrmw or ptr addrspace(7) %ptr, i64 1 monotonic, align 8 ret void } @@ -66,7 +66,7 @@ define amdgpu_gfx void @raw_atomicrmw_xor(<4 x i32> inreg %desc) !lgc.shaderstag ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.xor.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %xor = atomicrmw xor ptr addrspace(7) %ptr, i64 1 monotonic, align 8 ret void } @@ -76,7 +76,7 @@ define amdgpu_gfx void @raw_atomicrmw_smax(<4 x i32> inreg %desc) !lgc.shadersta ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.smax.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %smax = atomicrmw max ptr addrspace(7) %ptr, i64 1 monotonic, align 8 ret void } @@ -86,7 +86,7 @@ define amdgpu_gfx void @raw_atomicrmw_smin(<4 x i32> inreg %desc) !lgc.shadersta ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.smin.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %smin = atomicrmw min ptr addrspace(7) %ptr, i64 1 monotonic, align 8 ret void } @@ -96,7 +96,7 @@ define amdgpu_gfx void @raw_atomicrmw_umax(<4 x i32> inreg %desc) !lgc.shadersta ; CHECK-NEXT: [[TMP1:%.*]] = call i64 
@llvm.amdgcn.raw.buffer.atomic.umax.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %umax = atomicrmw umax ptr addrspace(7) %ptr, i64 1 monotonic, align 8 ret void } @@ -106,7 +106,7 @@ define amdgpu_gfx void @raw_atomicrmw_umin(<4 x i32> inreg %desc) !lgc.shadersta ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.umin.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %umin = atomicrmw umin ptr addrspace(7) %ptr, i64 1 monotonic, align 8 ret void } @@ -116,7 +116,7 @@ define amdgpu_gfx void @raw_atomicrmw_fadd(<4 x i32> inreg %desc) !lgc.shadersta ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32{{(\.v4i32)?}}(float 1.000000e+00, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %fadd = atomicrmw fadd ptr addrspace(7) %ptr, float 1.0 monotonic, align 8 ret void } @@ -126,7 +126,7 @@ define amdgpu_gfx void @raw_atomicrmw_fmax(<4 x i32> inreg %desc) !lgc.shadersta ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32{{(\.v4i32)?}}(float 1.000000e+00, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %fmax = atomicrmw fmax ptr addrspace(7) %ptr, float 1.0 monotonic, align 8 ret void } @@ -136,7 +136,7 @@ define amdgpu_gfx void @raw_atomicrmw_fmin(<4 x i32> 
inreg %desc) !lgc.shadersta ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32{{(\.v4i32)?}}(float 1.000000e+00, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %fmin = atomicrmw fmin ptr addrspace(7) %ptr, float 1.0 monotonic, align 8 ret void } @@ -157,7 +157,7 @@ define amdgpu_gfx void @struct_atomic_load(<4 x i32> inreg %desc, i32 %index) !l ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 5) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %load = load atomic i32, ptr addrspace(9) %struct.ptr.idx monotonic, align 8 @@ -180,7 +180,7 @@ define amdgpu_gfx void @struct_atomicrmw_xchg(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.swap.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %xchg = atomicrmw xchg ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8 @@ -203,7 +203,7 @@ define amdgpu_gfx void @struct_atomicrmw_add(<4 x i32> inreg %desc, 
i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %add = atomicrmw add ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8 @@ -226,7 +226,7 @@ define amdgpu_gfx void @struct_atomicrmw_sub(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.sub.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %sub = atomicrmw sub ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8 @@ -249,7 +249,7 @@ define amdgpu_gfx void @struct_atomicrmw_and(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.and.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr 
addrspace(9) %struct.ptr, i32 %index) %and = atomicrmw and ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8 @@ -272,7 +272,7 @@ define amdgpu_gfx void @struct_atomicrmw_or(<4 x i32> inreg %desc, i32 %index) ! ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.or.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %or = atomicrmw or ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8 @@ -295,7 +295,7 @@ define amdgpu_gfx void @struct_atomicrmw_xor(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.xor.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %xor = atomicrmw xor ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8 @@ -318,7 +318,7 @@ define amdgpu_gfx void @struct_atomicrmw_smax(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.smax.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 
x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %smax = atomicrmw max ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8 @@ -341,7 +341,7 @@ define amdgpu_gfx void @struct_atomicrmw_smin(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.smin.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %smin = atomicrmw min ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8 @@ -364,7 +364,7 @@ define amdgpu_gfx void @struct_atomicrmw_umax(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.umax.i64{{(\.v4i32)?}}(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %umax = atomicrmw umax ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8 @@ -387,7 +387,7 @@ define amdgpu_gfx void @struct_atomicrmw_umin(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.umin.i64{{(\.v4i32)?}}(i64 1, <4 x i32> 
[[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %umin = atomicrmw umin ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8 @@ -410,7 +410,7 @@ define amdgpu_gfx void @struct_atomicrmw_fadd(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32{{(\.v4i32)?}}(float 1.000000e+00, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %fadd = atomicrmw fadd ptr addrspace(9) %struct.ptr.idx, float 1.0 monotonic, align 8 @@ -433,7 +433,7 @@ define amdgpu_gfx void @struct_atomicrmw_fmax(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.struct.buffer.atomic.fmax.f32{{(\.v4i32)?}}(float 1.000000e+00, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %fmax = atomicrmw fmax ptr addrspace(9) 
%struct.ptr.idx, float 1.0 monotonic, align 8 @@ -456,14 +456,14 @@ define amdgpu_gfx void @struct_atomicrmw_fmin(<4 x i32> inreg %desc, i32 %index) ; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.struct.buffer.atomic.fmin.f32{{(\.v4i32)?}}(float 1.000000e+00, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0) ; CHECK-NEXT: ret void ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8) %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index) %fmin = atomicrmw fmin ptr addrspace(9) %struct.ptr.idx, float 1.0 monotonic, align 8 ret void } -declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) nounwind readnone +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>, i1) nounwind readnone declare ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7), i32, i32) declare ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7), i32) declare ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9), i32) diff --git a/lgc/test/Transforms/PatchBufferOp/simple.lgc b/lgc/test/Transforms/LowerBufferOperations/simple.lgc similarity index 67% rename from lgc/test/Transforms/PatchBufferOp/simple.lgc rename to lgc/test/Transforms/LowerBufferOperations/simple.lgc index 8bab179202..dd8d434598 100644 --- a/lgc/test/Transforms/PatchBufferOp/simple.lgc +++ b/lgc/test/Transforms/LowerBufferOperations/simple.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc -; RUN: lgc -o - -passes='require,function(lgc-patch-buffer-op)' %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -o - -passes='require,function(lgc-lower-buffer-operations)' %s | FileCheck --check-prefixes=CHECK %s define amdgpu_gfx float @simple(<4 x i32> inreg %desc) 
!lgc.shaderstage !0 { ; CHECK-LABEL: @simple( @@ -7,7 +7,7 @@ define amdgpu_gfx float @simple(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float ; CHECK-NEXT: ret float [[TMP2]] ; - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %r = load float, ptr addrspace(7) %ptr ret float %r } @@ -19,8 +19,8 @@ define amdgpu_gfx float @uniform_select(<4 x i32> inreg %desc0, <4 x i32> inreg ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float ; CHECK-NEXT: ret float [[TMP2]] ; - %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) - %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) + %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0, i1 false) + %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1, i1 false) %ptr = select i1 %sel, ptr addrspace(7) %ptr0, ptr addrspace(7) %ptr1 %r = load float, ptr addrspace(7) %ptr ret float %r @@ -40,8 +40,8 @@ define amdgpu_gfx float @divergent_select(<4 x i32> inreg %desc0, <4 x i32> inre ; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 ; CHECK-NEXT: ret float [[TMP9]] ; - %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) - %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) + %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0, i1 false) + %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1, i1 false) %ptr = select i1 %sel, ptr addrspace(7) %ptr0, ptr addrspace(7) %ptr1 %r = load float, ptr addrspace(7) %ptr ret float %r @@ -61,8 +61,8 @@ define amdgpu_gfx float @divergent_select1(<4 x i32> %desc0, <4 x i32> inreg %de ; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 ; CHECK-NEXT: ret float [[TMP9]] ; - %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> 
%desc0) - %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) + %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0, i1 false) + %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1, i1 false) %ptr = select i1 %sel, ptr addrspace(7) %ptr0, ptr addrspace(7) %ptr1 %r = load float, ptr addrspace(7) %ptr ret float %r @@ -82,8 +82,8 @@ define amdgpu_gfx float @divergent_select2(<4 x i32> inreg %desc0, <4 x i32> %de ; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 ; CHECK-NEXT: ret float [[TMP9]] ; - %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) - %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) + %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0, i1 false) + %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1, i1 false) %ptr = select i1 %sel, ptr addrspace(7) %ptr0, ptr addrspace(7) %ptr1 %r = load float, ptr addrspace(7) %ptr ret float %r @@ -106,7 +106,7 @@ define amdgpu_gfx void @divergent_phi_uniform_desc(<4 x i32> inreg %desc0, i32 % ; CHECK-NEXT: ret void ; entry: - %ptr.0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) + %ptr.0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0, i1 false) br label %loop loop: @@ -145,11 +145,11 @@ define amdgpu_gfx float @divergent_input0_phi(<4 x i32> %desc0, <4 x i32> inreg br i1 %sel, label %a, label %b a: - %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) + %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0, i1 false) br label %tail b: - %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) + %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1, i1 false) br label %tail tail: @@ -181,11 +181,11 @@ define amdgpu_gfx float @divergent_input1_phi(<4 x i32> inreg %desc0, <4 x i32> br i1 %sel, label %a, label %b a: - %ptr0 = call ptr addrspace(7) 
@lgc.buffer.desc.to.ptr(<4 x i32> %desc0) + %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0, i1 false) br label %tail b: - %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) + %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1, i1 false) br label %tail tail: @@ -217,11 +217,11 @@ define amdgpu_gfx float @divergent_sync_phi(<4 x i32> inreg %desc0, <4 x i32> in br i1 %sel, label %a, label %b a: - %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) + %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0, i1 false) br label %tail b: - %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) + %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1, i1 false) br label %tail tail: @@ -230,6 +230,92 @@ tail: ret float %r } -declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) nounwind readnone +define amdgpu_gfx float @buffer_load_desc_to_ptr(ptr addrspace(7) inreg %desc) !lgc.shaderstage !0 { +; CHECK-LABEL: @buffer_load_desc_to_ptr( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(7) [[DESC:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP1]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float +; CHECK-NEXT: ret float [[TMP3]] +; + %ptr = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(7) %desc, i1 false, i1 false, i1 false) + %r = load float, ptr addrspace(7) %ptr + ret float %r +} + +define amdgpu_gfx float @load_desc_uniform_select(ptr addrspace(7) inreg %desc0, ptr addrspace(7) inreg %desc1, i1 inreg %sel) !lgc.shaderstage !0 { +; CHECK-LABEL: @load_desc_uniform_select( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(7) [[DESC0:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(7) [[DESC1:%.*]], align 16 +; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> 
[[TMP1]], <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[PTR_0]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float +; CHECK-NEXT: ret float [[TMP4]] +; + %ptr0 = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(7) %desc0, i1 false, i1 false, i1 false) + %ptr1 = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(7) %desc1, i1 false, i1 false, i1 false) + %ptr = select i1 %sel, ptr addrspace(7) %ptr0, ptr addrspace(7) %ptr1 + %r = load float, ptr addrspace(7) %ptr + ret float %r +} + +define amdgpu_gfx float @load_desc_divergent_select(ptr addrspace(7) inreg %desc0, ptr addrspace(7) inreg %desc1, i1 %sel) !lgc.shaderstage !0 { +; CHECK-LABEL: @load_desc_divergent_select( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(7) [[DESC0:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(7) [[DESC1:%.*]], align 16 +; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 0, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = and <2 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP4]], i32 0, i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP8]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(1) [[TMP10]], align 4 +; CHECK-NEXT: ret float [[TMP11]] +; + %ptr0 = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(7) %desc0, i1 false, i1 false, i1 false) + %ptr1 = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(7) %desc1, i1 false, i1 false, i1 false) + %ptr = 
select i1 %sel, ptr addrspace(7) %ptr0, ptr addrspace(7) %ptr1 + %r = load float, ptr addrspace(7) %ptr + ret float %r +} + +define amdgpu_gfx void @load_desc_phi(ptr addrspace(7) inreg %desc, i32 %stride) !lgc.shaderstage !0 { +; CHECK-LABEL: @load_desc_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[DESC:%.*]], align 16 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PTR_PHI_1:%.*]] = phi ptr addrspace(6) [ null, [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[CTR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[CTR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(6) [[PTR_PHI_1]] to i32 +; CHECK-NEXT: call void @llvm.amdgcn.raw.buffer.store.i32{{(\.v4i32)?}}(i32 0, <4 x i32> [[TMP0]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2]] = getelementptr i32, ptr addrspace(6) [[PTR_PHI_1]], i32 [[STRIDE:%.*]] +; CHECK-NEXT: [[CTR_NEXT]] = add i32 [[CTR]], 1 +; CHECK-NEXT: [[CC:%.*]] = icmp ne i32 [[CTR_NEXT]], 1024 +; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + %ptr.0 = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(7) %desc, i1 false, i1 false, i1 false) + br label %loop + +loop: + %ptr.phi = phi ptr addrspace(7) [ %ptr.0, %entry ], [ %ptr.next, %loop ] + %ctr = phi i32 [ 0, %entry ], [ %ctr.next, %loop ] + store i32 0, ptr addrspace(7) %ptr.phi + %ptr.next = getelementptr i32, ptr addrspace(7) %ptr.phi, i32 %stride + %ctr.next = add i32 %ctr, 1 + %cc = icmp ne i32 %ctr.next, 1024 + br i1 %cc, label %loop, label %end + +end: + ret void +} + +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>, i1) nounwind readnone +declare ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(7), i1, i1, i1) nounwind readnone !0 = !{i32 7} diff --git a/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc b/lgc/test/Transforms/LowerBufferOperations/strided-buffer-ops.lgc similarity 
index 97% rename from lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc rename to lgc/test/Transforms/LowerBufferOperations/strided-buffer-ops.lgc index 6e207d81c3..82c2be38c5 100644 --- a/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc +++ b/lgc/test/Transforms/LowerBufferOperations/strided-buffer-ops.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 2 -; RUN: lgc --mcpu=gfx1100 -o - -passes="require,module(lgc-lower-desc),module(lgc-mutate-entry-point),function(lgc-patch-buffer-op)" %s | FileCheck --check-prefixes=GFX11 %s +; RUN: lgc --mcpu=gfx1100 -o - -passes="require,module(lgc-lower-desc),module(lgc-mutate-entry-point),function(lgc-lower-buffer-operations)" %s | FileCheck --check-prefixes=GFX11 %s define amdgpu_kernel void @strided_buffer_desc_to_ptr(<4 x i32> inreg %desc, ptr %out) { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr @@ -11,7 +11,7 @@ define amdgpu_kernel void @strided_buffer_desc_to_ptr(<4 x i32> inreg %desc, ptr ; GFX11-NEXT: ret void ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc) + %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %res = load float, ptr addrspace(9) %buf, align 4 store float %res, ptr %out, align 4 ret void @@ -27,7 +27,7 @@ define amdgpu_kernel void @strided_buffer_desc_to_ptr_index(<4 x i32> inreg %des ; GFX11-NEXT: ret void ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc) + %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf, i32 %index) %res = load float, ptr addrspace(9) %buf.idx, align 4 store float %res, ptr %out, align 4 @@ -44,7 +44,7 @@ define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_offset(<4 x i32> inr ; GFX11-NEXT: ret void ; entry: - %buf = call ptr addrspace(9) 
@lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc) + %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf, i32 %index) %buf.off = getelementptr inbounds i8, ptr addrspace(9) %buf.idx, i32 8 %res = load float, ptr addrspace(9) %buf.off, align 4 @@ -62,7 +62,7 @@ define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_zero(<4 x i32> i ; GFX11-NEXT: ret void ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc) + %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf, i32 0) %res = load float, ptr addrspace(9) %buf.idx, align 4 store float %res, ptr %out, align 4 @@ -80,7 +80,7 @@ define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice(<4 x i32> ; GFX11-NEXT: ret void ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc) + %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf, i32 %index) %buf.idx.2 = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf.idx, i32 %index) %res = load float, ptr addrspace(9) %buf.idx.2, align 4 @@ -99,7 +99,7 @@ define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice_constant_n ; GFX11-NEXT: ret void ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc) + %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf, i32 %index) %buf.idx.2 = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf.idx, i32 4) %res = load float, ptr addrspace(9) %buf.idx.2, align 4 @@ -118,7 +118,7 @@ define amdgpu_kernel void 
@strided_buffer_desc_to_ptr_index_add_twice_constant_o ; GFX11-NEXT: ret void ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc) + %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf, i32 4) %buf.idx.2 = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf.idx, i32 %index) %res = load float, ptr addrspace(9) %buf.idx.2, align 4 @@ -136,7 +136,7 @@ define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice_constant_b ; GFX11-NEXT: ret void ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc) + %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf, i32 4) %buf.idx.2 = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf.idx, i32 2) %res = load float, ptr addrspace(9) %buf.idx.2, align 4 @@ -154,7 +154,7 @@ define amdgpu_kernel void @strided_buffer_desc_to_ptr_offset_index(<4 x i32> inr ; GFX11-NEXT: ret void ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc) + %buf = call ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %buf.off = getelementptr inbounds i8, ptr addrspace(9) %buf, i32 8 %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf.off, i32 %index) %res = load float, ptr addrspace(9) %buf.idx, align 4 @@ -174,13 +174,13 @@ define float @addr_and_stride_to_ptr(i64 inreg %addr, i32 %stride) { ; GFX11-NEXT: [[TMP5:%.*]] = or i32 [[TMP2]], [[TMP4]] ; GFX11-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP5]], i64 1 ; GFX11-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 -1, i64 2 -; GFX11-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 536956844, i64 3 +; GFX11-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> 
[[TMP7]], i32 805392300, i64 3 ; GFX11-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP8]], i32 0, i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP9]] to float ; GFX11-NEXT: ret float [[TMP10]] ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64 %addr, i32 %stride) + %buf = call ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64 %addr, i32 %stride, i1 false) %res = load float, ptr addrspace(9) %buf, align 4 ret float %res } @@ -197,13 +197,13 @@ define float @addr_and_stride_to_ptr_index(i64 inreg %addr, i32 inreg %index, i3 ; GFX11-NEXT: [[TMP5:%.*]] = or i32 [[TMP2]], [[TMP4]] ; GFX11-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP5]], i64 1 ; GFX11-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 -1, i64 2 -; GFX11-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 536956844, i64 3 +; GFX11-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 805392300, i64 3 ; GFX11-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP8]], i32 [[INDEX]], i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP9]] to float ; GFX11-NEXT: ret float [[TMP10]] ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64 %addr, i32 %stride) + %buf = call ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64 %addr, i32 %stride, i1 false) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf, i32 %index) %res = load float, ptr addrspace(9) %buf.idx, align 4 ret float %res @@ -221,13 +221,13 @@ define float @addr_and_stride_to_ptr_index_offset(i64 inreg %addr, i32 inreg %in ; GFX11-NEXT: [[TMP5:%.*]] = or i32 [[TMP2]], [[TMP4]] ; GFX11-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP5]], i64 1 ; GFX11-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 -1, i64 2 -; GFX11-NEXT: [[TMP8:%.*]] = 
insertelement <4 x i32> [[TMP7]], i32 536956844, i64 3 +; GFX11-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 805392300, i64 3 ; GFX11-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP8]], i32 [[INDEX]], i32 ptrtoint (ptr addrspace(6) getelementptr inbounds (i8, ptr addrspace(6) null, i32 8) to i32), i32 0, i32 0) ; GFX11-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP9]] to float ; GFX11-NEXT: ret float [[TMP10]] ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64 %addr, i32 %stride) + %buf = call ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64 %addr, i32 %stride, i1 false) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf, i32 %index) %buf.offs = getelementptr inbounds i8, ptr addrspace(9) %buf.idx, i32 8 %res = load float, ptr addrspace(9) %buf.offs, align 4 @@ -246,13 +246,13 @@ define float @addr_and_stride_to_ptr_offset_index(i64 inreg %addr, i32 inreg %in ; GFX11-NEXT: [[TMP5:%.*]] = or i32 [[TMP2]], [[TMP4]] ; GFX11-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP5]], i64 1 ; GFX11-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 -1, i64 2 -; GFX11-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 536956844, i64 3 +; GFX11-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 805392300, i64 3 ; GFX11-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP8]], i32 [[INDEX]], i32 ptrtoint (ptr addrspace(6) getelementptr inbounds (i8, ptr addrspace(6) null, i32 8) to i32), i32 0, i32 0) ; GFX11-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP9]] to float ; GFX11-NEXT: ret float [[TMP10]] ; entry: - %buf = call ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64 %addr, i32 %stride) + %buf = call ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64 %addr, i32 %stride, i1 false) %buf.offs = getelementptr inbounds i8, ptr addrspace(9) %buf, i32 
8 %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf.offs, i32 %index) %res = load float, ptr addrspace(9) %buf.idx, align 4 @@ -261,7 +261,7 @@ entry: define amdgpu_kernel void @constant_strided_buffer_desc_to_ptr_index(<4 x i32> inreg %desc, i32 %index, ptr %out) #0 !lgc.shaderstage !4 { ; GFX11-LABEL: define amdgpu_gfx void @constant_strided_buffer_desc_to_ptr_index -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1:[0-9]+]] !lgc.shaderstage [[META6:![0-9]+]] { +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1:[0-9]+]] 
!lgc.shaderstage [[META7:![0-9]+]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -278,7 +278,7 @@ define amdgpu_kernel void @constant_strided_buffer_desc_to_ptr_index(<4 x i32> i ; GFX11-NEXT: [[TMP12:%.*]] = or i32 [[TMP10]], 1048576 ; GFX11-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 1 ; GFX11-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 -1, i64 2 -; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 536956844, i64 3 +; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 805392300, i64 3 ; GFX11-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP15]], i32 24, i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP17:%.*]] = bitcast i32 [[TMP16]] to float ; GFX11-NEXT: store float [[TMP17]], ptr [[OUT]], align 4 @@ -295,7 +295,7 @@ entry: define amdgpu_kernel void @strided_buffer_uniform_strided_load(<4 x i32> %desc, ptr %out) #0 !lgc.shaderstage !4 { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_uniform_strided_load -; GFX11-SAME: (<4 x i32> [[DESC:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META6]] { +; GFX11-SAME: (<4 x i32> [[DESC:%.*]], ptr [[OUT:%.*]], i32 inreg 
noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META7]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -312,7 +312,7 @@ define amdgpu_kernel void @strided_buffer_uniform_strided_load(<4 x i32> %desc, ; GFX11-NEXT: [[TMP12:%.*]] = or i32 [[TMP10]], 1048576 ; GFX11-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 1 ; GFX11-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 -1, i64 2 -; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 536956844, i64 3 +; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 805392300, i64 3 ; GFX11-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP15]], i32 24, i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP17:%.*]] = bitcast i32 [[TMP16]] to float ; GFX11-NEXT: store float [[TMP17]], ptr [[OUT]], align 4 @@ -329,7 +329,7 @@ entry: define amdgpu_kernel void @strided_buffer_convert_uniform_strided_load(<4 x i32> inreg %desc, ptr %out) #0 !lgc.shaderstage !4 { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_convert_uniform_strided_load -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef 
[[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META6]] { +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META7]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -347,13 +347,13 @@ define amdgpu_kernel void @strided_buffer_convert_uniform_strided_load(<4 x i32> ; GFX11-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], -805306369 ; GFX11-NEXT: [[TMP14:%.*]] = or i32 [[TMP13]], 268435456 ; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP14]], i64 3 -; GFX11-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[DESC]], i32 add (i32 ptrtoint 
(ptr addrspace(6) getelementptr inbounds (i8, ptr addrspace(6) null, i32 8) to i32), i32 288), i32 0), !invariant.load [[META7:![0-9]+]] +; GFX11-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[DESC]], i32 add (i32 ptrtoint (ptr addrspace(6) getelementptr inbounds (i8, ptr addrspace(6) null, i32 8) to i32), i32 288), i32 0), !invariant.load [[META8:![0-9]+]] ; GFX11-NEXT: [[TMP17:%.*]] = bitcast i32 [[TMP16]] to float ; GFX11-NEXT: store float [[TMP17]], ptr [[OUT]], align 4 ; GFX11-NEXT: ret void ; entry: - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %146 = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) %ptr) %buf.off = getelementptr inbounds i8, ptr addrspace(7) %ptr, i32 8 %buf.cnv = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %buf.off, i32 12) @@ -365,7 +365,7 @@ entry: define amdgpu_kernel void @strided_buffer_divergent_idx_strided_load(<4 x i32> %desc, i32 %index, ptr %out) #0 !lgc.shaderstage !4 { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_divergent_idx_strided_load -; GFX11-SAME: (<4 x i32> [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META6]] { +; GFX11-SAME: (<4 x i32> 
[[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META7]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -382,7 +382,7 @@ define amdgpu_kernel void @strided_buffer_divergent_idx_strided_load(<4 x i32> % ; GFX11-NEXT: [[TMP12:%.*]] = or i32 [[TMP10]], 1048576 ; GFX11-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 1 ; GFX11-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 -1, i64 2 -; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 536956844, i64 3 +; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 805392300, i64 3 ; GFX11-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP15]], i32 [[INDEX]], i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP17:%.*]] = bitcast i32 [[TMP16]] to float ; GFX11-NEXT: store float [[TMP17]], ptr [[OUT]], align 4 @@ -399,7 +399,7 @@ entry: define amdgpu_kernel void @strided_buffer_convert_divergent_idx_strided_load(<4 x i32> inreg %desc, i32 %index, ptr %out) #0 !lgc.shaderstage !4 { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_convert_divergent_idx_strided_load -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 
[[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META6]] { +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META7]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -423,7 +423,7 @@ define amdgpu_kernel void @strided_buffer_convert_divergent_idx_strided_load(<4 ; GFX11-NEXT: ret void ; entry: - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %146 = call ptr 
@llvm.invariant.start.p7(i64 -1, ptr addrspace(7) %ptr) %buf.off = getelementptr inbounds i8, ptr addrspace(7) %ptr, i32 8 %buf.cnv = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %buf.off, i32 12) @@ -435,7 +435,7 @@ entry: define amdgpu_kernel void @strided_buffer_divergent_ptr_strided_load(<4 x i32> %desc, ptr %out) #0 !lgc.shaderstage !4 { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_divergent_ptr_strided_load -; GFX11-SAME: (<4 x i32> [[DESC:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META6]] { +; GFX11-SAME: (<4 x i32> [[DESC:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage 
[[META7]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -452,7 +452,7 @@ define amdgpu_kernel void @strided_buffer_divergent_ptr_strided_load(<4 x i32> % ; GFX11-NEXT: [[TMP12:%.*]] = or i32 [[TMP10]], 1048576 ; GFX11-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 1 ; GFX11-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 -1, i64 2 -; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 536956844, i64 3 +; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 805392300, i64 3 ; GFX11-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP15]], i32 24, i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP17:%.*]] = bitcast i32 [[TMP16]] to float ; GFX11-NEXT: store float [[TMP17]], ptr [[OUT]], align 4 @@ -468,7 +468,7 @@ entry: define amdgpu_kernel void @strided_buffer_convert_divergent_ptr_strided_load(<4 x i32> %desc, i32 inreg %index, ptr %out) #0 !lgc.shaderstage !4 { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_convert_divergent_ptr_strided_load -; GFX11-SAME: (<4 x i32> [[DESC:%.*]], i32 inreg [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META6]] { +; GFX11-SAME: (<4 x 
i32> [[DESC:%.*]], i32 inreg [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META7]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -505,7 +505,7 @@ define amdgpu_kernel void @strided_buffer_convert_divergent_ptr_strided_load(<4 ; GFX11-NEXT: ret void ; entry: - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %buf.off = getelementptr inbounds i8, ptr addrspace(7) %ptr, i32 8 %buf.cnv = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %buf.off, i32 12) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf.cnv, i32 24) @@ -516,7 +516,7 @@ entry: define amdgpu_kernel void @strided_buffer_divergent_strided_load(<4 x i32> %desc, i32 %index, ptr %out) #0 !lgc.shaderstage !4 { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_divergent_strided_load -; GFX11-SAME: (<4 x i32> [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 
inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META6]] { +; GFX11-SAME: (<4 x i32> [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META7]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -533,7 +533,7 @@ define amdgpu_kernel void @strided_buffer_divergent_strided_load(<4 x i32> %desc ; GFX11-NEXT: [[TMP12:%.*]] = or i32 [[TMP10]], 1048576 ; GFX11-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 1 ; GFX11-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 -1, i64 2 -; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 536956844, i64 3 +; GFX11-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 805392300, i64 3 ; GFX11-NEXT: [[TMP16:%.*]] = call i32 
@llvm.amdgcn.struct.buffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP15]], i32 [[INDEX]], i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP17:%.*]] = bitcast i32 [[TMP16]] to float ; GFX11-NEXT: store float [[TMP17]], ptr [[OUT]], align 4 @@ -549,7 +549,7 @@ entry: define amdgpu_kernel void @strided_buffer_convert_divergent_strided_load(<4 x i32> %desc, i32 %index, ptr %out) #0 !lgc.shaderstage !4 { ; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_convert_divergent_strided_load -; GFX11-SAME: (<4 x i32> [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META6]] { +; GFX11-SAME: (<4 x i32> [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef 
[[LOCALINVOCATIONID:%.*]]) #[[ATTR1]] !lgc.shaderstage [[META7]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -586,7 +586,7 @@ define amdgpu_kernel void @strided_buffer_convert_divergent_strided_load(<4 x i3 ; GFX11-NEXT: ret void ; entry: - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %buf.off = getelementptr inbounds i8, ptr addrspace(7) %ptr, i32 8 %buf.cnv = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %buf.off, i32 12) %buf.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %buf.cnv, i32 %index) @@ -596,10 +596,10 @@ entry: } ; Function Attrs: nounwind willreturn memory(none) -declare ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32>) #0 +declare ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32>, i1) #0 ; Function Attrs: nounwind willreturn memory(none) -declare ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64, i32) #0 +declare ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64, i32, i1) #0 ; Function Attrs: nounwind willreturn memory(none) declare ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9), i32) #0 @@ -609,9 +609,9 @@ declare ptr addrspace(9) @lgc.load.strided.buffer.desc(i64, i32, i32, i32, i32) declare ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7), i32) -declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) nounwind readnone +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>, i1) nounwind readnone -declare ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4), i1, i1) nounwind readnone +declare ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4), i1, i1, i1) nounwind readnone ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) declare ptr 
@llvm.invariant.start.p9(i64 immarg, ptr addrspace(9) nocapture) #1 diff --git a/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc b/lgc/test/Transforms/LowerBufferOperations/uniform-phi.lgc similarity index 91% rename from lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc rename to lgc/test/Transforms/LowerBufferOperations/uniform-phi.lgc index 103daa3696..f9838fe387 100644 --- a/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc +++ b/lgc/test/Transforms/LowerBufferOperations/uniform-phi.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc -; RUN: lgc -o - -passes='require,function(lgc-patch-buffer-op)' %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -o - -passes='require,function(lgc-lower-buffer-operations)' %s | FileCheck --check-prefixes=CHECK %s ; TODO: This one is too pessimistic: the descriptor is really uniform but we ; can't know this because the initial divergence analysis treats @@ -30,11 +30,11 @@ define amdgpu_gfx float @uniform_phi(<4 x i32> inreg %desc0, <4 x i32> inreg %de br i1 %sel, label %a, label %b a: - %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) + %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0, i1 false) br label %tail b: - %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) + %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1, i1 false) br label %tail tail: @@ -43,6 +43,6 @@ tail: ret float %r } -declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) nounwind readnone +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>, i1) nounwind readnone !0 = !{i32 7} diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc index 3738960bb8..6aad0b526f 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc @@ -2,20 +2,20 
@@ ; RUN: lgc -march=amdgcn -o - --mcpu=gfx1010 -filetype=asm %s | FileCheck -check-prefixes=CHECK %s define void @matmul_f16f32_emulator(ptr addrspace(3) %out0, <8 x float> %a, <8 x float> %b, <8 x float> %c0) !lgc.shaderstage !0 { - %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 2) - call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %value) + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 2, i32 2, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %value, i32 16) ret void } define void @matmul_i16i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 4, i32 4, i32 5) - call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 4, i32 4, i32 5, i32 5, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value, i32 16) ret void } define void @matmul_i8i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 3, i32 3, i32 5) - call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 3, i32 3, i32 5, i32 5, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value, i32 16) ret void } diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc index c1aca85e3d..6be3cb0a23 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc @@ -2,20 +2,20 @@ ; RUN: lgc -march=amdgcn -o - --mcpu=gfx1011 -filetype=asm %s | FileCheck -check-prefixes=CHECK %s define void @matmul_f16f32_emulator(ptr addrspace(3) %out0, <8 x float> %a, <8 x float> %b, <8 x float> %c0) !lgc.shaderstage !0 { - %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 2) - call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %value) + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1, i32 2, i32 2, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %value, i32 16) ret void } define void @matmul_i16i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 4, i32 4, i32 5) - call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 4, i32 4, i32 5, i32 5, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value, i32 16) ret void } define void @matmul_i8i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { - %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 3, i32 3, i32 5) - call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 3, i32 3, i32 5, i32 5, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value, i32 16) ret void } diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1100muladd.lgc similarity index 69% rename from lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc rename to lgc/test/Transforms/LowerCooperativeMatrix/gfx1100muladd.lgc index 99b85f4344..26b6c31996 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1100muladd.lgc @@ -11,7 +11,7 @@ define <8 x i32> @muladd_bf16_bf16(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) { ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[VALUE1]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; - %value = call <8 x i32> (...) 
@lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7, i32 7) + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7, i32 7, i32 7, i32 1) ret <8 x i32> %value } @@ -23,9 +23,20 @@ define <8 x float> @muladd_bf16_f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c) ; CHECK-NEXT: [[VALUE1:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <8 x float> [[C]]) ; CHECK-NEXT: ret <8 x float> [[VALUE1]] ; - %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7, i32 2) + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7, i32 2, i32 2, i32 1) ret <8 x float> %value } +define <4 x i32> @muladd_16x16x16_iu4(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: define <4 x i32> @muladd_16x16x16_iu4( +; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) { +; CHECK-NEXT: [[MULADD1:%.*]] = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 false, <2 x i32> [[A]], i1 false, <2 x i32> [[B]], <4 x i32> [[C]], i1 false) +; CHECK-NEXT: ret <4 x i32> [[MULADD1]] +; + %mulAdd = call <4 x i32> (...) @lgc.cooperative.matrix.muladd__v4i32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c, i1 false, i1 false, i1 false, i1 false, i32 10, i32 10, i32 5, i32 5, i32 1) + ret <4 x i32> %mulAdd +} + declare <8 x i32> @lgc.cooperative.matrix.muladd__v8i32(...) declare <8 x float> @lgc.cooperative.matrix.muladd__v8f32(...) +declare <4 x i32> @lgc.cooperative.matrix.muladd__v4i32(...) 
diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc index 53ede1663b..bc9768d7e4 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc @@ -74,7 +74,7 @@ define <8 x float> @test_f16_ab_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMo ; CHECK-NEXT: [[TMP69:%.*]] = bitcast <16 x half> [[TMP68]] to <8 x float> ; CHECK-NEXT: ret <8 x float> [[TMP69]] ; - %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 0, i32 0, i32 16) + %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 0, i32 0, i32 16, i32 16) ret <8 x float> %a } @@ -106,7 +106,7 @@ define <8 x float> @test_f16_cd_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMo ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x half> [[TMP23]] to <8 x float> ; CHECK-NEXT: ret <8 x float> [[TMP24]] ; - %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 1, i32 0, i32 16) + %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 1, i32 0, i32 16, i32 16) ret <8 x float> %a } @@ -183,7 +183,7 @@ define <8 x i32> @test_i16_ab_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMode ; CHECK-NEXT: [[TMP69:%.*]] = bitcast <16 x i16> [[TMP68]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP69]] ; - %a = call <8 x i32> (...) @lgc.cooperative.matrix.load__v8i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 0, i32 0, i32 16) + %a = call <8 x i32> (...) 
@lgc.cooperative.matrix.load__v8i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 0, i32 0, i32 16, i32 16) ret <8 x i32> %a } @@ -215,7 +215,7 @@ define <8 x i32> @test_i16_cd_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMode ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i16> [[TMP23]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP24]] ; - %a = call <8 x i32> (...) @lgc.cooperative.matrix.load__v8i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 1, i32 0, i32 16) + %a = call <8 x i32> (...) @lgc.cooperative.matrix.load__v8i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 1, i32 0, i32 16, i32 16) ret <8 x i32> %a } @@ -246,7 +246,7 @@ define <8 x float> @test_f32_cd_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMo ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x float> [[TMP22]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP23]] ; - %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) %ptr, i32 640, i1 false, i32 2, i32 1, i32 0, i32 16) + %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) %ptr, i32 640, i1 false, i32 2, i32 1, i32 0, i32 16, i32 16) ret <8 x float> %a } @@ -277,7 +277,7 @@ define <8 x i32> @test_i32_cd_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMode ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP23]] ; - %a = call <8 x i32> (...) @lgc.cooperative.matrix.load__v8i32(ptr addrspace(7) %ptr, i32 640, i1 false, i32 5, i32 1, i32 0, i32 16) + %a = call <8 x i32> (...) @lgc.cooperative.matrix.load__v8i32(ptr addrspace(7) %ptr, i32 640, i1 false, i32 5, i32 1, i32 0, i32 16, i32 16) ret <8 x i32> %a } @@ -323,9 +323,10 @@ define <2 x i32> @test_i4_ab_layout(ptr addrspace(7) %ptr) !spirv.ExecutionModel ; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[TMP38]] ; - %a = call <2 x i32> (...) 
@lgc.cooperative.matrix.load__v2i32(ptr addrspace(7) %ptr, i32 128, i1 true, i32 10, i32 0, i32 2, i32 16) + %a = call <2 x i32> (...) @lgc.cooperative.matrix.load__v2i32(ptr addrspace(7) %ptr, i32 128, i1 true, i32 10, i32 0, i32 2, i32 16, i32 16) ret <2 x i32> %a } + declare <8 x float> @lgc.cooperative.matrix.load__v8f32(...) declare <8 x i32> @lgc.cooperative.matrix.load__v8i32(...) declare <2 x i32> @lgc.cooperative.matrix.load__v2i32(...) diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/store-wave64.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/store-wave64.lgc index 64c888a45a..f3b781b646 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/store-wave64.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/store-wave64.lgc @@ -74,7 +74,7 @@ define void @test_f16_ab_layout(ptr addrspace(7) %ptr, <8 x float> %a) !spirv.Ex ; CHECK-NEXT: store half [[TMP53]], ptr addrspace(7) [[TMP52]], align 2 ; CHECK-NEXT: ret void ; - call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 0, i32 0, i32 16, <8 x float> %a) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 0, i32 0, i32 16, <8 x float> %a, i32 16) ret void } @@ -106,7 +106,7 @@ define void @test_f16_cd_layout(ptr addrspace(7) %ptr, <8 x float> %a) !spirv.Ex ; CHECK-NEXT: store half [[TMP20]], ptr addrspace(7) [[TMP19]], align 2 ; CHECK-NEXT: ret void ; - call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 1, i32 0, i32 16, <8 x float> %a) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 1, i32 0, i32 16, <8 x float> %a, i32 16) ret void } @@ -183,7 +183,7 @@ define void @test_i16_ab_layout(ptr addrspace(7) %ptr, <8 x i32> %a) !spirv.Exec ; CHECK-NEXT: store i16 [[TMP53]], ptr addrspace(7) [[TMP52]], align 2 ; CHECK-NEXT: ret void ; - call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 0, i32 0, i32 16, <8 x i32> %a) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 0, i32 0, i32 16, <8 x i32> %a, i32 16) ret void } @@ -215,7 +215,7 @@ define void @test_i16_cd_layout(ptr addrspace(7) %ptr, <8 x i32> %a) !spirv.Exec ; CHECK-NEXT: store i16 [[TMP20]], ptr addrspace(7) [[TMP19]], align 2 ; CHECK-NEXT: ret void ; - call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 1, i32 0, i32 16, <8 x i32> %a) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 1, i32 0, i32 16, <8 x i32> %a, i32 16) ret void } @@ -246,7 +246,7 @@ define void @test_f32_cd_layout(ptr addrspace(7) %ptr, <8 x float> %a) !spirv.Ex ; CHECK-NEXT: store float [[TMP19]], ptr addrspace(7) [[TMP18]], align 4 ; CHECK-NEXT: ret void ; - call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 640, i1 false, i32 2, i32 1, i32 0, i32 16, <8 x float> %a) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 640, i1 false, i32 2, i32 1, i32 0, i32 16, <8 x float> %a, i32 16) ret void } @@ -277,7 +277,7 @@ define void @test_i32_cd_layout(ptr addrspace(7) %ptr, <8 x i32> %a) !spirv.Exec ; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(7) [[TMP18]], align 4 ; CHECK-NEXT: ret void ; - call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 640, i1 false, i32 5, i32 1, i32 0, i32 16, <8 x i32> %a) + call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 640, i1 false, i32 5, i32 1, i32 0, i32 16, <8 x i32> %a, i32 16) ret void } diff --git a/lgc/test/Transforms/LowerDebugPrintf/basic.lgc b/lgc/test/Transforms/LowerDebugPrintf/basic.lgc index 817cb273b0..9c879a10d0 100644 --- a/lgc/test/Transforms/LowerDebugPrintf/basic.lgc +++ b/lgc/test/Transforms/LowerDebugPrintf/basic.lgc @@ -12,7 +12,7 @@ define spir_func void @simple() !lgc.shaderstage !0 { !lgc.user.data.nodes = !{!4, !5} !4 = !{!"DescriptorTableVaPtr", i32 7, i32 0, i32 0, i32 1, i32 1} -!5 = !{!"DescriptorBuffer", i32 6, i32 0, i32 0, i32 4, i32 -1, i32 6, i32 4} +!5 = !{!"DescriptorBuffer", i32 6, i32 0, i32 0, i32 4, i32 -16, i32 6, i32 4} ; IR: !amdgpu.pal.metadata.msgpack = diff --git a/lgc/test/Transforms/PeepholeOpt/PeepholeOptLog2PowUnderflow.lgc b/lgc/test/Transforms/PeepholeOpt/PeepholeOptLog2PowUnderflow.lgc new file mode 100644 index 0000000000..a95dad17f3 --- /dev/null +++ b/lgc/test/Transforms/PeepholeOpt/PeepholeOptLog2PowUnderflow.lgc @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 2 +; RUN: lgc -mcpu=gfx1010 -passes='require,function(lgc-peephole-optimization)' -o - %s | FileCheck --check-prefixes=CHECK %s + +; Test that log2 and pow underflow transform is correctly applied. 
+ +target triple = "amdgcn--amdpal" + +define dllexport spir_func float @lgc.shader.FS.main.log2(float %x, float %y, float %z) local_unnamed_addr #0 !lgc.shaderstage !0 { +; CHECK-LABEL: define dllexport spir_func float @lgc.shader.FS.main.log2 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !lgc.shaderstage [[META0:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[MUL0A:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[X]], 0x3F9C71C720000000 +; CHECK-NEXT: [[SUB0A:%.*]] = fsub reassoc nnan nsz arcp contract afn float 1.000000e+00, [[MUL0A]] +; CHECK-NEXT: [[TMP0:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[SUB0A]]) +; CHECK-NEXT: [[LOG0A:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float [[TMP0]]) +; CHECK-NEXT: [[MUL0B:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[X]], 0x3F9C71C720000000 +; CHECK-NEXT: [[SUB0B:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[MUL0B]], 2.000000e+00 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[SUB0B]]) +; CHECK-NEXT: [[LOG0B:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float [[TMP1]]) +; CHECK-NEXT: [[MUL1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[Y]], 0x3F9C71C720000000 +; CHECK-NEXT: [[SUB1:%.*]] = fsub reassoc nsz arcp contract afn float 1.000000e+00, [[MUL1]] +; CHECK-NEXT: [[LOG1:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float [[SUB1]]) +; CHECK-NEXT: [[MUL2A:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[Z]], 0x3F9C71C720000000 +; CHECK-NEXT: [[ADD2A:%.*]] = fadd reassoc nnan nsz arcp contract afn float 1.000000e+00, [[MUL2A]] +; CHECK-NEXT: [[TMP2:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[ADD2A]]) +; CHECK-NEXT: [[LOG2A:%.*]] = call reassoc nnan nsz arcp contract 
afn float @llvm.log2.f32(float [[TMP2]]) +; CHECK-NEXT: [[MUL2B:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[Z]], 0x3F9C71C720000000 +; CHECK-NEXT: [[ADD2B:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[MUL2B]], 2.000000e+00 +; CHECK-NEXT: [[TMP3:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[ADD2B]]) +; CHECK-NEXT: [[LOG2B:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float [[TMP3]]) +; CHECK-NEXT: [[RET_0:%.*]] = fadd float [[LOG0A]], [[LOG0B]] +; CHECK-NEXT: [[RET_1:%.*]] = fadd float [[RET_0]], [[LOG1]] +; CHECK-NEXT: [[RET_2:%.*]] = fadd float [[RET_1]], [[LOG2A]] +; CHECK-NEXT: [[RET_3:%.*]] = fadd float [[RET_2]], [[LOG2B]] +; CHECK-NEXT: ret float [[RET_3]] +; +.entry: + %mul0a = fmul reassoc nnan nsz arcp contract afn float %x, 0x3F9C71C720000000 + %sub0a = fsub reassoc nnan nsz arcp contract afn float 1.000000e+00, %mul0a + %log0a = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float %sub0a) + + %mul0b = fmul reassoc nnan nsz arcp contract afn float %x, 0x3F9C71C720000000 + %sub0b = fsub reassoc nnan nsz arcp contract afn float %mul0b, 2.000000e+00 + %log0b = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float %sub0b) + + %mul1 = fmul reassoc nnan nsz arcp contract afn float %y, 0x3F9C71C720000000 + %sub1 = fsub reassoc nsz arcp contract afn float 1.000000e+00, %mul1 + %log1 = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float %sub1) + + %mul2a = fmul reassoc nnan nsz arcp contract afn float %z, 0x3F9C71C720000000 + %add2a = fadd reassoc nnan nsz arcp contract afn float 1.000000e+00, %mul2a + %log2a = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float %add2a) + + %mul2b = fmul reassoc nnan nsz arcp contract afn float %z, 0x3F9C71C720000000 + %add2b = fadd reassoc nnan nsz arcp contract afn float %mul2b, 2.000000e+00 + %log2b = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float %add2b) 
+ + %ret.0 = fadd float %log0a, %log0b + %ret.1 = fadd float %ret.0, %log1 + %ret.2 = fadd float %ret.1, %log2a + %ret.3 = fadd float %ret.2, %log2b + ret float %ret.3 +} + +define dllexport spir_func float @lgc.shader.FS.main.pow(float %x, float %y, float %z) local_unnamed_addr #0 !lgc.shaderstage !0 { +; CHECK-LABEL: define dllexport spir_func float @lgc.shader.FS.main.pow +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) local_unnamed_addr #[[ATTR0]] !lgc.shaderstage [[META0]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[MUL0A:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[X]], 0x3F9C71C720000000 +; CHECK-NEXT: [[SUB0A:%.*]] = fsub reassoc nnan nsz arcp contract afn float 1.000000e+00, [[MUL0A]] +; CHECK-NEXT: [[TMP0:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[SUB0A]]) +; CHECK-NEXT: [[LOG0A:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.pow.f32(float [[TMP0]], float 2.000000e+00) +; CHECK-NEXT: [[MUL0B:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[X]], 0x3F9C71C720000000 +; CHECK-NEXT: [[SUB0B:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[MUL0B]], 2.000000e+00 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[SUB0B]]) +; CHECK-NEXT: [[LOG0B:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.pow.f32(float [[TMP1]], float 2.000000e+00) +; CHECK-NEXT: [[MUL1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[Y]], 0x3F9C71C720000000 +; CHECK-NEXT: [[SUB1:%.*]] = fsub reassoc nsz arcp contract afn float 1.000000e+00, [[MUL1]] +; CHECK-NEXT: [[LOG1:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.pow.f32(float [[SUB1]], float 2.000000e+00) +; CHECK-NEXT: [[MUL2A:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[Z]], 0x3F9C71C720000000 +; CHECK-NEXT: [[ADD2A:%.*]] = fadd reassoc nnan nsz arcp contract afn float 1.000000e+00, [[MUL2A]] +; CHECK-NEXT: 
[[TMP2:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[ADD2A]]) +; CHECK-NEXT: [[LOG2A:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.pow.f32(float [[TMP2]], float 2.000000e+00) +; CHECK-NEXT: [[MUL2B:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[Z]], 0x3F9C71C720000000 +; CHECK-NEXT: [[ADD2B:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[MUL2B]], 2.000000e+00 +; CHECK-NEXT: [[TMP3:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[ADD2B]]) +; CHECK-NEXT: [[LOG2B:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.pow.f32(float [[TMP3]], float 2.000000e+00) +; CHECK-NEXT: [[RET_0:%.*]] = fadd float [[LOG0A]], [[LOG0B]] +; CHECK-NEXT: [[RET_1:%.*]] = fadd float [[RET_0]], [[LOG1]] +; CHECK-NEXT: [[RET_2:%.*]] = fadd float [[RET_1]], [[LOG2A]] +; CHECK-NEXT: [[RET_3:%.*]] = fadd float [[RET_2]], [[LOG2B]] +; CHECK-NEXT: ret float [[RET_3]] +; +.entry: + %mul0a = fmul reassoc nnan nsz arcp contract afn float %x, 0x3F9C71C720000000 + %sub0a = fsub reassoc nnan nsz arcp contract afn float 1.000000e+00, %mul0a + %log0a = call reassoc nnan nsz arcp contract afn float @llvm.pow.f32(float %sub0a, float 2.0) + + %mul0b = fmul reassoc nnan nsz arcp contract afn float %x, 0x3F9C71C720000000 + %sub0b = fsub reassoc nnan nsz arcp contract afn float %mul0b, 2.000000e+00 + %log0b = call reassoc nnan nsz arcp contract afn float @llvm.pow.f32(float %sub0b, float 2.0) + + %mul1 = fmul reassoc nnan nsz arcp contract afn float %y, 0x3F9C71C720000000 + %sub1 = fsub reassoc nsz arcp contract afn float 1.000000e+00, %mul1 + %log1 = call reassoc nnan nsz arcp contract afn float @llvm.pow.f32(float %sub1, float 2.0) + + %mul2a = fmul reassoc nnan nsz arcp contract afn float %z, 0x3F9C71C720000000 + %add2a = fadd reassoc nnan nsz arcp contract afn float 1.000000e+00, %mul2a + %log2a = call reassoc nnan nsz arcp contract afn float @llvm.pow.f32(float 
%add2a, float 2.0) + + %mul2b = fmul reassoc nnan nsz arcp contract afn float %z, 0x3F9C71C720000000 + %add2b = fadd reassoc nnan nsz arcp contract afn float %mul2b, 2.000000e+00 + %log2b = call reassoc nnan nsz arcp contract afn float @llvm.pow.f32(float %add2b, float 2.0) + + %ret.0 = fadd float %log0a, %log0b + %ret.1 = fadd float %ret.0, %log1 + %ret.2 = fadd float %ret.1, %log2a + %ret.3 = fadd float %ret.2, %log2b + ret float %ret.3 +} + +declare float @llvm.log2.f32(float) #0 +declare float @llvm.pow.f32(float, float) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!lgc.options = !{!1} +!lgc.options.FS = !{!2} + +!0 = !{i32 6} +!1 = !{i32 -1450315588, i32 -1820538735, i32 732113874, i32 -67578670, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 2, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 256 } +!2 = !{i32 -1310089663, i32 1186545208, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216 } diff --git a/lgc/test/Transforms/PeepholeOpt/PeepholeOptLog2Underflow.lgc b/lgc/test/Transforms/PeepholeOpt/PeepholeOptLog2Underflow.lgc deleted file mode 100644 index 2aa88733b7..0000000000 --- a/lgc/test/Transforms/PeepholeOpt/PeepholeOptLog2Underflow.lgc +++ /dev/null @@ -1,69 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 2 -; RUN: lgc -mcpu=gfx1010 -passes=lgc-peephole-optimization -o - %s | FileCheck --check-prefixes=CHECK %s - -; Test that log2 underflow transform is correctly applied. 
- -target triple = "amdgcn--amdpal" - -define dllexport spir_func float @lgc.shader.FS.main(float %x, float %y, float %z) local_unnamed_addr #0 !lgc.shaderstage !0 { -; CHECK-LABEL: define dllexport spir_func float @lgc.shader.FS.main -; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !lgc.shaderstage !0 { -; CHECK-NEXT: .entry: -; CHECK-NEXT: [[MUL0A:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[X]], 0x3F9C71C720000000 -; CHECK-NEXT: [[SUB0A:%.*]] = fsub reassoc nnan nsz arcp contract afn float 1.000000e+00, [[MUL0A]] -; CHECK-NEXT: [[TMP0:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[SUB0A]]) -; CHECK-NEXT: [[LOG0A:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float [[TMP0]]) -; CHECK-NEXT: [[MUL0B:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[X]], 0x3F9C71C720000000 -; CHECK-NEXT: [[SUB0B:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[MUL0B]], 2.000000e+00 -; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[SUB0B]]) -; CHECK-NEXT: [[LOG0B:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float [[TMP1]]) -; CHECK-NEXT: [[MUL1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[Y]], 0x3F9C71C720000000 -; CHECK-NEXT: [[SUB1:%.*]] = fsub reassoc nsz arcp contract afn float 1.000000e+00, [[MUL1]] -; CHECK-NEXT: [[LOG1:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float [[SUB1]]) -; CHECK-NEXT: [[MUL2A:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[Z]], 0x3F9C71C720000000 -; CHECK-NEXT: [[ADD2A:%.*]] = fadd reassoc nnan nsz arcp contract afn float 1.000000e+00, [[MUL2A]] -; CHECK-NEXT: [[TMP2:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[ADD2A]]) -; CHECK-NEXT: [[LOG2A:%.*]] = call reassoc nnan nsz arcp contract afn float 
@llvm.log2.f32(float [[TMP2]]) -; CHECK-NEXT: [[MUL2B:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[Z]], 0x3F9C71C720000000 -; CHECK-NEXT: [[ADD2B:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[MUL2B]], 2.000000e+00 -; CHECK-NEXT: [[TMP3:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float 0.000000e+00, float [[ADD2B]]) -; CHECK-NEXT: [[LOG2B:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float [[TMP3]]) -; CHECK-NEXT: [[RET_0:%.*]] = fadd float [[LOG0A]], [[LOG0B]] -; CHECK-NEXT: [[RET_1:%.*]] = fadd float [[RET_0]], [[LOG1]] -; CHECK-NEXT: [[RET_2:%.*]] = fadd float [[RET_1]], [[LOG2A]] -; CHECK-NEXT: [[RET_3:%.*]] = fadd float [[RET_2]], [[LOG2B]] -; CHECK-NEXT: ret float [[RET_3]] -; -.entry: - %mul0a = fmul reassoc nnan nsz arcp contract afn float %x, 0x3F9C71C720000000 - %sub0a = fsub reassoc nnan nsz arcp contract afn float 1.000000e+00, %mul0a - %log0a = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float %sub0a) - - %mul0b = fmul reassoc nnan nsz arcp contract afn float %x, 0x3F9C71C720000000 - %sub0b = fsub reassoc nnan nsz arcp contract afn float %mul0b, 2.000000e+00 - %log0b = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float %sub0b) - - %mul1 = fmul reassoc nnan nsz arcp contract afn float %y, 0x3F9C71C720000000 - %sub1 = fsub reassoc nsz arcp contract afn float 1.000000e+00, %mul1 - %log1 = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float %sub1) - - %mul2a = fmul reassoc nnan nsz arcp contract afn float %z, 0x3F9C71C720000000 - %add2a = fadd reassoc nnan nsz arcp contract afn float 1.000000e+00, %mul2a - %log2a = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float %add2a) - - %mul2b = fmul reassoc nnan nsz arcp contract afn float %z, 0x3F9C71C720000000 - %add2b = fadd reassoc nnan nsz arcp contract afn float %mul2b, 2.000000e+00 - %log2b = call reassoc nnan nsz arcp contract afn float @llvm.log2.f32(float %add2b) - - %ret.0 
= fadd float %log0a, %log0b - %ret.1 = fadd float %ret.0, %log1 - %ret.2 = fadd float %ret.1, %log2a - %ret.3 = fadd float %ret.2, %log2b - ret float %ret.3 -} - -declare float @llvm.log2.f32(float) #0 - -attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } - -!0 = !{i32 6} diff --git a/lgc/test/UberFetchShader.lgc b/lgc/test/UberFetchShader.lgc index 7951375e25..97424bc995 100644 --- a/lgc/test/UberFetchShader.lgc +++ b/lgc/test/UberFetchShader.lgc @@ -91,7 +91,7 @@ attributes #1 = { nounwind readonly willreturn } !2 = !{i32 225099809, i32 -29817230, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800} !3 = !{i32 2068278405, i32 41923448, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800} !4 = !{!"IndirectUserDataVaPtr", i32 8, i32 0, i32 0, i32 1, i32 4} -!5 = !{!"DescriptorBufferCompact", i32 10, i32 0, i32 1, i32 2, i32 -1, i32 5, i32 2} +!5 = !{!"DescriptorBufferCompact", i32 10, i32 0, i32 1, i32 2, i32 -16, i32 5, i32 2} !6 = !{i32 0, i32 0, i32 0, i32 12, i32 13, i32 7, i32 -1} !7 = !{i32 10} !8 = !{i32 4, i32 3} diff --git a/lgc/test/WorkgroupIdOpt.lgc b/lgc/test/WorkgroupIdOpt.lgc index a33032b127..f9959f7209 100644 --- a/lgc/test/WorkgroupIdOpt.lgc +++ b/lgc/test/WorkgroupIdOpt.lgc @@ -15,7 +15,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(4) ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP4]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP4]], i32 -1) ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4) [[TMP5]], i1 false, i1 false) +; CHECK-NEXT: 
[[TMP6:%.*]] = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4) [[TMP5]], i1 false, i1 false, i1 false) ; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[LOCALINVOCATIONID]], 1023 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <3 x i32> poison, i32 [[TMP7]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[LOCALINVOCATIONID]], 10 @@ -78,7 +78,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !spi %5 = inttoptr i64 %4 to ptr addrspace(4) call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) %5, i32 4), "dereferenceable"(ptr addrspace(4) %5, i32 -1) ] %6 = getelementptr i8, ptr addrspace(4) %5, i32 0 - %7 = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4) %6, i1 false, i1 false) + %7 = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4) %6, i1 false, i1 false, i1 false) %8 = call i32 @lgc.shader.input.LocalInvocationId(i32 50) #2 %9 = and i32 %8, 1023 %10 = insertelement <3 x i32> poison, i32 %9, i64 0 @@ -165,7 +165,7 @@ declare noundef i64 @llvm.amdgcn.s.getpc() #3 declare void @llvm.assume(i1 noundef) #4 ; Function Attrs: nounwind willreturn memory(none) -declare ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4), i1, i1) #1 +declare ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4), i1, i1, i1) #1 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize32" } attributes #1 = { nounwind willreturn memory(none) } diff --git a/lgc/test/lgcdis.lgc b/lgc/test/lgcdis.lgc index 6124595390..855a887c8a 100644 --- a/lgc/test/lgcdis.lgc +++ b/lgc/test/lgcdis.lgc @@ -13,7 +13,7 @@ ; CHECK: .ps: ; CHECK: .checksum_value: 0x759bd992 ; CHECK: .debug_mode: false -; CHECK: .entry_point: _amdgpu_ps_main +; CHECK: .entry_point{{(_symbol)?}}: _amdgpu_ps_main ; CHECK: .float_mode: 0xc0 ; CHECK: .ieee_mode: false ; CHECK: .mem_ordered: true diff --git a/lgc/util/Debug.cpp b/lgc/util/Debug.cpp index abf923c519..0cbd294fe4 100644 --- 
a/lgc/util/Debug.cpp +++ b/lgc/util/Debug.cpp @@ -28,7 +28,7 @@ * @brief LLPC header file: middle-end debug functions *********************************************************************************************************************** */ -#include "lgc/util/Debug.h" +#include "lgc/Debug.h" #include "lgc/LgcContext.h" using namespace llvm; @@ -41,4 +41,43 @@ raw_ostream *getLgcOuts() { return LgcContext::getLgcOuts(); } +void InstructionSlot::createFuncSlot(Function *func) { + m_iMap.clear(); + m_valueIndex = 0; + for (Argument &arg : func->args()) + if (!arg.hasName()) + createSlot(&arg); + + // Add all of the basic blocks and instructions with no names. + for (auto &bb : *func) { + if (!bb.hasName()) + createSlot(&bb); + + for (auto &inst : bb) { + if (!inst.getType()->isVoidTy()) + createSlot(&inst); + } + } +} + +Value *InstructionSlot::getValueByIdx(unsigned idx) { + if (m_iMap.find(idx) != m_iMap.end()) + return m_iMap[idx]; + return nullptr; +} + +Value *InstructionSlot::getValueByName(StringRef name) { + if (m_nMap.find(name) != m_nMap.end()) + return m_nMap[name]; + return nullptr; +} + +void InstructionSlot::createSlot(Value *val) { + if (val->hasName()) + m_nMap[val->getName()] = val; + else { + unsigned destSlot = m_valueIndex++; + m_iMap[destSlot] = val; + } +} } // namespace lgc diff --git a/lgc/util/GfxRegHandler.cpp b/lgc/util/GfxRegHandler.cpp index c91b353861..f24e8561cd 100644 --- a/lgc/util/GfxRegHandler.cpp +++ b/lgc/util/GfxRegHandler.cpp @@ -274,6 +274,7 @@ Value *SqImgRsrcRegHandler::getReg(SqRsrcRegs regId) { case 11: return m_builder->CreateAdd( getRegCombine(static_cast(SqRsrcRegs::WidthLo), static_cast(SqRsrcRegs::WidthHi)), m_one); + return m_builder->CreateAdd(getRegCommon(static_cast(SqRsrcRegs::Width)), m_one); default: llvm_unreachable("GFX IP is not supported!"); break; @@ -329,6 +330,8 @@ void SqImgRsrcRegHandler::setReg(SqRsrcRegs regId, Value *regValue) { setRegCombine(static_cast(SqRsrcRegs::WidthLo), 
static_cast(SqRsrcRegs::WidthHi), m_builder->CreateSub(regValue, m_one)); break; + setRegCommon(static_cast(SqRsrcRegs::Width), m_builder->CreateSub(regValue, m_one)); + break; default: llvm_unreachable("GFX IP is not supported!"); break; diff --git a/lgc/util/PassManager.cpp b/lgc/util/PassManager.cpp index fc4ba3328e..0aeb6bb710 100644 --- a/lgc/util/PassManager.cpp +++ b/lgc/util/PassManager.cpp @@ -30,8 +30,8 @@ */ #include "lgc/PassManager.h" #include "compilerutils/MbStandardInstrumentations.h" +#include "lgc/Debug.h" #include "lgc/LgcContext.h" -#include "lgc/util/Debug.h" #include "llvm/Analysis/CFGPrinter.h" #include "llvm/IR/PrintPasses.h" #include "llvm/IR/Verifier.h" @@ -127,6 +127,7 @@ class MbPassManagerImpl final : public lgc::MbPassManager { MbPassManagerImpl(TargetMachine *targetMachine); void registerPass(StringRef passName, StringRef className) override; void run(ModuleBunch &moduleBunch) override; + void setPassIndex(unsigned *passIndex) override { m_passIndex = passIndex; } PassInstrumentationCallbacks &getInstrumentationCallbacks() override { return m_instrumentationCallbacks; } bool stopped() const override { return m_stopped; } @@ -140,6 +141,7 @@ class MbPassManagerImpl final : public lgc::MbPassManager { CGSCCAnalysisManager m_cgsccAnalysisManager; // CGSCC analysis manager used when running the passes. PassInstrumentationCallbacks m_instrumentationCallbacks; // Instrumentation callbacks ran when running the passes. MbStandardInstrumentations m_instrumentationStandard; // LLVM's Standard instrumentations + unsigned *m_passIndex = nullptr; // Pass Index. 
bool m_initialized = false; // Whether the pass manager is initialized or not bool m_stopped = false; std::string m_stopAfter; @@ -373,10 +375,31 @@ void PassManagerImpl::registerCallbacks() { // Register LLPC's custom callbacks // void MbPassManagerImpl::registerCallbacks() { + auto beforePass = [this](StringRef passName, Any ir) { + if (passName != PrintModuleBunchPass::name() && passName != PrintModulePass::name() && m_passIndex) { + unsigned passIndex = (*m_passIndex)++; + if (cl::DumpPassName) + LLPC_OUTS("Pass[" << passIndex << "] = " << passName << "\n"); + } + }; + m_instrumentationCallbacks.registerBeforeSkippedPassCallback(beforePass); + m_instrumentationCallbacks.registerBeforeNonSkippedPassCallback(beforePass); + m_instrumentationCallbacks.registerShouldRunOptionalPassCallback([this](StringRef className, Any ir) { // NOLINT if (m_stopped) return false; + // Check if the user disabled that specific pass index. + if (className != PrintModuleBunchPass::name() && className != PrintModulePass::name() && m_passIndex) { + unsigned passIndex = *m_passIndex; + for (auto disableIndex : cl::DisablePassIndices) { + if (disableIndex == passIndex) { + LLPC_OUTS("Pass[" << passIndex << "] = " << className << " (disabled)\n"); + return false; + } + } + } + StringRef passName = m_instrumentationCallbacks.getPassNameForClassName(className); if (!m_stopAfter.empty() && passName == m_stopAfter) { // This particular pass still gets to run, but we skip everything afterwards. 
diff --git a/lgc/util/StartStopTimer.cpp b/lgc/util/StartStopTimer.cpp index 1949ba1153..d771240ad5 100644 --- a/lgc/util/StartStopTimer.cpp +++ b/lgc/util/StartStopTimer.cpp @@ -102,7 +102,7 @@ ModulePass *LgcContext::createStartStopTimer(Timer *timer, bool starting) { // @param passMgr : Pass manager to add the pass to // @param timer : The timer to start or stop when the pass is run // @param starting : True to start the timer, false to stop it -void LgcContext::createAndAddStartStopTimer(lgc::PassManager &passMgr, Timer *timer, bool starting) { +void LgcContext::createAndAddStartStopTimer(ModulePassManager &passMgr, Timer *timer, bool starting) { passMgr.addPass(StartStopTimer(timer, starting)); } diff --git a/llpc/CMakeLists.txt b/llpc/CMakeLists.txt index dc17862d8a..0011291d86 100644 --- a/llpc/CMakeLists.txt +++ b/llpc/CMakeLists.txt @@ -199,6 +199,10 @@ if(ICD_BUILD_LLPC) lowering/ScalarReplacementOfBuiltins.h lowering/ProcessGfxRuntimeLibrary.cpp lowering/ProcessGfxRuntimeLibrary.h + lowering/LinkTransformShaders.cpp + lowering/LinkTransformShaders.h + lowering/PrepareTransformVertexShader.cpp + lowering/PrepareTransformVertexShader.h ) # llpc/translator @@ -248,6 +252,8 @@ if(ICD_BUILD_LLPC) util/llpcFile.h util/llpcShaderModuleHelper.cpp util/llpcShaderModuleHelper.h + util/llpcThreading.cpp + util/llpcThreading.h util/llpcTimerProfiler.cpp util/llpcTimerProfiler.h util/llpcUtil.cpp diff --git a/llpc/context/llpcCompiler.cpp b/llpc/context/llpcCompiler.cpp index 1dbe40065f..23fb17a2d7 100644 --- a/llpc/context/llpcCompiler.cpp +++ b/llpc/context/llpcCompiler.cpp @@ -30,17 +30,23 @@ */ #include "llpcCompiler.h" #include "LLVMSPIRVLib.h" +#include "LinkTransformShaders.h" +#include "LowerAccessChain.h" #include "LowerAdvancedBlend.h" #include "LowerCfgMerges.h" +#include "LowerGlCompatibility.h" +#include "LowerGlobals.h" #include "LowerRayTracing.h" #include "LowerTranslator.h" #include "Lowering.h" #include "LoweringUtil.h" #include 
"PrepareContinuations.h" +#include "PrepareTransformVertexShader.h" #include "SPIRVEntry.h" #include "SPIRVFunction.h" #include "SPIRVInstruction.h" #include "SPIRVInternal.h" +#include "ScalarReplacementOfBuiltins.h" #include "llpcCacheAccessor.h" #include "llpcComputeContext.h" #include "llpcContext.h" @@ -58,6 +64,7 @@ #include "vkgcDefs.h" #include "vkgcElfReader.h" #include "vkgcPipelineDumper.h" +#include "compilerutils/ModuleBunch.h" #include "llvmraytracing/Continuations.h" #include "llvmraytracing/GpurtContext.h" #include "lgc/Builder.h" @@ -90,6 +97,7 @@ #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/Transforms/Scalar/SROA.h" #include "llvm/Transforms/Utils/Cloning.h" #include @@ -227,21 +235,7 @@ static MetroHash::Hash SOptionHash = {}; unsigned Compiler::m_instanceCount = 0; unsigned Compiler::m_outRedirectCount = 0; -// Represents the payload used by helper thread to build ray tracing Elf -struct HelperThreadBuildRayTracingPipelineElfPayload { - ArrayRef modules; // Modules to generate ELF packages - std::vector &pipelineElfs; // Output ELF packages - std::vector &shaderProps; // Output RayTracingShaderProperty - std::vector &moduleCallsTraceRay; // Whether each module calls OpTraceRay - std::vector &results; // Build result of each module - RayTracingContext *rayTracingContext; // The ray tracing context across the pipeline - Compiler *compiler; // The compiler instance - std::atomic helperThreadJoined; // Whether helper thread has joined - std::atomic mainThreadSwitchedContext; // Whether main thread has finished switching context -}; - sys::Mutex Compiler::m_helperThreadMutex; -std::condition_variable_any Compiler::m_helperThreadConditionVariable; // ===================================================================================================================== // Handler for LLVM fatal error. 
@@ -1879,6 +1873,12 @@ Result Compiler::buildPipelineInternal(Context *context, ArrayRefensureGfxRuntimeLibrary(); + bool isTransformPipeline = false; + auto computePipelineInfo = + static_cast(context->getPipelineContext()->getPipelineBuildInfo()); + if (computePipelineInfo != nullptr && computePipelineInfo->transformGraphicsPipeline != nullptr) + isTransformPipeline = true; + for (unsigned shaderIndex = 0; shaderIndex < shaderInfo.size() && result == Result::Success; ++shaderIndex) { const PipelineShaderInfo *shaderInfoEntry = shaderInfo[shaderIndex]; if (!shaderInfoEntry || !shaderInfoEntry->pModuleData) @@ -1932,6 +1932,9 @@ Result Compiler::buildPipelineInternal(Context *context, ArrayRefaddPass(LinkTransformShaders()); + if (moduleData->usage.enableRayQuery) { assert(!moduleData->usage.rayQueryLibrary); context->ensureGpurtLibrary(); @@ -2067,6 +2070,75 @@ Result Compiler::buildPipelineInternal(Context *context, ArrayRefsetDiagnosticHandler(std::make_unique(&hasError)); + + // Set up middle-end objects. 
+ LgcContext *builderContext = context->getLgcContext(); + std::unique_ptr pipeline(builderContext->createPipeline()); + context->getPipelineContext()->setPipelineState(&*pipeline, /*hasher=*/nullptr, false); + context->setBuilder(builderContext->createBuilder(&*pipeline)); + + // Get transform vertex shader library, bitcode will be returned + auto moduleData = reinterpret_cast(shaderInfo->pModuleData); + const_cast(moduleData)->usage.keepUnusedFunctions = true; + + auto gfxRuntime = std::make_unique("transformruntime", *context); + context->setModuleTargetMachine(gfxRuntime.get()); + + TimerProfiler timerProfiler(context->getPipelineHashCode(), "LLPC GfxRuntime", + TimerProfiler::PipelineTimerEnableMask); + std::unique_ptr lowerPassMgr(lgc::PassManager::Create(context->getLgcContext())); + SpirvLower::registerTranslationPasses(*lowerPassMgr); + + timerProfiler.addTimerStartStopPass(*lowerPassMgr, TimerTranslate, true); + + lowerPassMgr->addPass(LowerTranslator(ShaderStageVertex, shaderInfo)); + if (EnableOuts()) { + lowerPassMgr->addPass( + PrintModulePass(outs(), "\n" + "===============================================================================\n" + "// LLPC SPIRV-to-LLVM translation results for transform vertex shader\n")); + } + + // Function inlining. Use the "always inline" pass, since we want to inline all functions, and + // we marked (non-entrypoint) functions as "always inline" just after SPIR-V reading. + lowerPassMgr->addPass(AlwaysInlinerPass()); + lowerPassMgr->addPass(GlobalDCEPass()); + + // Lower SPIR-V access chain + lowerPassMgr->addPass(LowerAccessChain()); + + // Split up and replace global variables that are structs of builtins. 
+ lowerPassMgr->addPass(ScalarReplacementOfBuiltins()); + + // Lower Glsl compatibility variables and operations + lowerPassMgr->addPass(LowerGlCompatibility()); + + lowerPassMgr->addPass(PrepareTransformVertexShader()); + + // Lower SPIR-V global variables, inputs, and outputs + lowerPassMgr->addPass(LowerGlobals()); + + lowerPassMgr->addPass(BitcodeWriterPass(outStream)); + timerProfiler.addTimerStartStopPass(*lowerPassMgr, TimerTranslate, false); + + lowerPassMgr->run(*gfxRuntime); + + return result; +} + // ===================================================================================================================== // Check shader cache for graphics pipeline, returning mask of which shader stages we want to keep in this compile. // This is called from the CheckShaderCache pass (via a lambda in BuildPipelineInternal), to remove @@ -2549,12 +2621,36 @@ Result Compiler::buildComputePipelineInternal(ComputeContext *computeContext, // @param pipelineDumpFile : Handle of pipeline dump file Result Compiler::BuildComputePipeline(const ComputePipelineBuildInfo *pipelineInfo, ComputePipelineBuildOut *pipelineOut, void *pipelineDumpFile) { + Result result = Result::Success; BinaryData elfBin = {}; + SmallString<16> outBuffer; + raw_svector_ostream outStream(outBuffer); + + // Compile transform vertex shader if it is a transform pipeline + auto gfxPipelineInfo = pipelineInfo->transformGraphicsPipeline; + if (gfxPipelineInfo != nullptr) { + result = validatePipelineShaderInfo(&gfxPipelineInfo->vs); + if (result != Result::Success) + return result; + + MetroHash::Hash vtxCacheHash = PipelineDumper::generateHashForGraphicsPipeline(gfxPipelineInfo, true); + MetroHash::Hash vtxPipelineHash = PipelineDumper::generateHashForGraphicsPipeline(gfxPipelineInfo, false); + + GraphicsContext graphicsContext(m_gfxIp, m_apiName, gfxPipelineInfo, &vtxPipelineHash, &vtxCacheHash); + Context *context = acquireContext(); + context->attachPipelineContext(&graphicsContext); + + result = 
buildTransformVertexShader(context, &gfxPipelineInfo->vs, outStream); + releaseContext(context); + + if (result != Result::Success) + return result; + } const bool relocatableElfRequested = pipelineInfo->options.enableRelocatableShaderElf || cl::UseRelocatableShaderElf; const bool buildUsingRelocatableElf = relocatableElfRequested && canUseRelocatableComputeShaderElf(pipelineInfo); - Result result = validatePipelineShaderInfo(&pipelineInfo->cs); + result = validatePipelineShaderInfo(&pipelineInfo->cs); if (result != Result::Success) return result; @@ -2589,7 +2685,8 @@ Result Compiler::BuildComputePipeline(const ComputePipelineBuildInfo *pipelineIn ElfPackage candidateElf; if (!cacheAccessor || !cacheAccessor->isInCache()) { LLPC_OUTS("Cache miss for compute pipeline.\n"); - ComputeContext *computeContext = new ComputeContext(m_gfxIp, m_apiName, pipelineInfo, &pipelineHash, &cacheHash); + ComputeContext *computeContext = + new ComputeContext(m_gfxIp, m_apiName, pipelineInfo, outStream.str(), &pipelineHash, &cacheHash); result = buildComputePipelineInternal(computeContext, pipelineInfo, buildUsingRelocatableElf, &candidateElf, &pipelineOut->stageCacheAccess); delete computeContext; @@ -2711,7 +2808,7 @@ Result Compiler::BuildRayTracingPipeline(const RayTracingPipelineBuildInfo *pipe } std::vector rayTracingShaderInfo; - rayTracingShaderInfo.reserve(pipelineInfo->shaderCount + 1); + rayTracingShaderInfo.reserve(pipelineInfo->shaderCount); for (unsigned i = 0; i < pipelineInfo->shaderCount; ++i) { rayTracingShaderInfo.push_back(&pipelineInfo->pShaders[i]); auto &shaderInfo = rayTracingShaderInfo[i]; @@ -2732,12 +2829,6 @@ Result Compiler::BuildRayTracingPipeline(const RayTracingPipelineBuildInfo *pipe rayTracingContext.updateRayFlagsKnownBits(knownBits); } - // Add entry module - PipelineShaderInfo raygenMainShaderInfo = pipelineInfo->pShaders[0]; - raygenMainShaderInfo.entryStage = ShaderStageRayTracingRayGen; - raygenMainShaderInfo.pModuleData = nullptr; - 
rayTracingShaderInfo.push_back(&raygenMainShaderInfo); - result = buildRayTracingPipelineInternal(rayTracingContext, rayTracingShaderInfo, false, elfBinarys, shaderProps, helperThreadProvider); @@ -2756,12 +2847,23 @@ Result Compiler::BuildRayTracingPipeline(const RayTracingPipelineBuildInfo *pipe size_t elfSize = 0; for (auto &elf : elfBinarys) { - if (elf.size() % 8 != 0) { - elf.resize(alignTo(elf.size(), alignof(BinaryData))); + // Align each individual elf to a multiple of 8, iff it is actually an ELF; otherwise it comes from -emit-lgc, + // -emit-llvm or -filetype=asm, and alignment will add extra nul bytes to textual output. + if (elf.size() >= 4 && elf.starts_with("\177ELF")) { + if (elf.size() % 8 != 0) { + elf.resize(alignTo(elf.size(), alignof(BinaryData))); + } } elfSize += elf.size(); } + // Make sure Vkgc::BinaryData address alignment, which requires 8 byte alignment + size_t elfSizeGap = 0; + if (elfSize % 8 != 0) { + elfSizeGap = alignTo(elfSize, alignof(BinaryData)) - elfSize; + elfSize += elfSizeGap; + } + size_t allocSize = elfSize; allocSize += binaryDataSize; @@ -2792,7 +2894,8 @@ Result Compiler::BuildRayTracingPipeline(const RayTracingPipelineBuildInfo *pipe } pipelineOut->pipelineBins = pipelineBins; - allocBuf = voidPtrInc(allocBuf, binaryDataSize); + // Plus gap to make sure address alignment + allocBuf = voidPtrInc(allocBuf, binaryDataSize + elfSizeGap); pipelineOut->shaderPropSet.shaderCount = shaderProps.size(); pipelineOut->shaderPropSet.traceRayIndex = shaderProps.size() - 1; if (!shaderProps.empty()) { @@ -2854,7 +2957,7 @@ Result Compiler::BuildRayTracingPipeline(const RayTracingPipelineBuildInfo *pipe Result Compiler::buildRayTracingPipelineElf(Context *context, std::unique_ptr module, ElfPackage &pipelineElf, std::vector &shaderProps, std::vector &moduleCallsTraceRay, unsigned moduleIndex, - std::unique_ptr &pipeline, TimerProfiler &timerProfiler) { + Pipeline &pipeline, TimerProfiler &timerProfiler) { auto rtContext = 
static_cast(context->getPipelineContext()); if (moduleIndex > 0) { auto &shaderProp = shaderProps[moduleIndex - 1]; @@ -2875,7 +2978,7 @@ Result Compiler::buildRayTracingPipelineElf(Context *context, std::unique_ptrgetOptions(); + auto options = pipeline.getOptions(); MetroHash64 hasher; MetroHash::Hash hash = {}; hasher.Update(options.hash[1]); @@ -2913,9 +3016,9 @@ Result Compiler::buildRayTracingPipelineElf(Context *context, std::unique_ptrsetOptions(options); + pipeline.setOptions(options); - generatePipeline(context, moduleIndex, std::move(module), pipelineElf, pipeline.get(), timerProfiler); + generatePipeline(context, moduleIndex, std::move(module), pipelineElf, &pipeline, timerProfiler); if (moduleIndex > 0) adjustRayTracingElf(&pipelineElf, rtContext, shaderProps[moduleIndex - 1]); @@ -2966,134 +3069,6 @@ void Compiler::setUseGpurt(lgc::Pipeline *pipeline) { pipeline->setOptions(options); } -// ===================================================================================================================== -// Build single ray tracing pipeline ELF package. 
-// -// @param IHelperThreadProvider : The helper thread provider -// @param payload : Payload to build ray tracing pipeline Elf package -void helperThreadBuildRayTracingPipelineElf(IHelperThreadProvider *helperThreadProvider, void *payload) { - HelperThreadBuildRayTracingPipelineElfPayload *helperThreadPayload = - static_cast(payload); - - helperThreadPayload->helperThreadJoined = true; - - unsigned moduleIndex = 0; - - // No remaining tasks, do not proceed - if (helperThreadProvider->GetNextTask(&moduleIndex) == false) - return; - - // Set up context for each helper thread - Context *context = helperThreadPayload->compiler->acquireContext(); - - bool hasError = false; - context->setDiagnosticHandler(std::make_unique(&hasError)); - - context->attachPipelineContext(helperThreadPayload->rayTracingContext); - - LgcContext *builderContext = context->getLgcContext(); - std::unique_ptr pipeline(builderContext->createPipeline()); - helperThreadPayload->rayTracingContext->setPipelineState(&*pipeline, /*hasher=*/nullptr, false); - context->setBuilder(builderContext->createBuilder(&*pipeline)); - - context->ensureGpurtLibrary(); - helperThreadPayload->compiler->setUseGpurt(&*pipeline); - - TimerProfiler timerProfiler(context->getPipelineHashCode(), "LLPC", TimerProfiler::PipelineTimerEnableMask); - - { - // Block the helper thread until main thread has switched context, see comment in - // Compiler::buildRayTracingPipelineInternal for why we need this. - std::unique_lock lock(helperThreadPayload->compiler->getHelperThreadMutex()); - helperThreadPayload->compiler->getHelperThreadConditionVariable().wait( - lock, [helperThreadPayload]() { return helperThreadPayload->mainThreadSwitchedContext.load(); }); - } - - do { - // NOTE: All modules were in the same context, which is not thread safe. We need to 'clone' the module into separate - // context here to ensure we can do the work simultaneously. 
We achieve this by outputting the module as bitcode and - // read it back in another context. - Module *originalModule = helperThreadPayload->modules[moduleIndex]; - - // FIXME: There will be out of sync assertion when the main thread is doing something related to context (probably - // in PipelineState::generate), and the helper thread is using bitcode writer, we need to find a decent solution for - // such situation. - SmallVector bcBuffer; - BitcodeWriter bcWriter(bcBuffer); - bcWriter.writeModule(*originalModule); - bcWriter.writeSymtab(); - bcWriter.writeStrtab(); - - SmallVectorMemoryBuffer bcMemBuf(std::move(bcBuffer), originalModule->getName()); - auto moduleOrErr = getLazyBitcodeModule(std::move(bcMemBuf), *context); - std::unique_ptr module = nullptr; - - if (!moduleOrErr) { - LLPC_ERRS("Failed to load bit code\n"); - helperThreadPayload->results[moduleIndex] = Result::ErrorInvalidShader; - helperThreadProvider->TaskCompleted(); - continue; - } - - module = std::move(*moduleOrErr); - if (Error errCode = module->materializeAll()) { - LLPC_ERRS("Failed to materialize module\n"); - module = nullptr; - helperThreadPayload->results[moduleIndex] = Result::ErrorInvalidShader; - helperThreadProvider->TaskCompleted(); - continue; - } - auto result = helperThreadPayload->compiler->buildRayTracingPipelineElf( - context, std::move(module), helperThreadPayload->pipelineElfs[moduleIndex], helperThreadPayload->shaderProps, - helperThreadPayload->moduleCallsTraceRay, moduleIndex, pipeline, timerProfiler); - - helperThreadPayload->results[moduleIndex] = hasError ? 
Result::ErrorInvalidShader : result; - - helperThreadProvider->TaskCompleted(); - } while (helperThreadProvider->GetNextTask(&moduleIndex)); - - context->setDiagnosticHandler(nullptr); - helperThreadPayload->compiler->releaseContext(context); -} - -// ===================================================================================================================== -// Limited implementation of Llpc::IHelperThreadProvider to support -add-rt-helpers. -// -// If no deferred work helper thread providers is available when additional threads are requested via -add-rt-helpers -// then use an instances of this class to coordinate helper threads. -class InternalHelperThreadProvider : public Llpc::IHelperThreadProvider { -public: - virtual void SetTasks(ThreadFunction *pFunction, uint32_t numTasks, void *pPayload) override { - assert(!m_totalInstances && "InternalHelperThreadProvider is single use"); - m_totalInstances = numTasks; - } - - virtual bool GetNextTask(uint32_t *pTaskIndex) override { - assert(pTaskIndex != nullptr); - *pTaskIndex = m_nextInstance.fetch_add(1); - return (*pTaskIndex < m_totalInstances); - } - - virtual void TaskCompleted() override { - uint32_t completedInstances = m_completedInstances.fetch_add(1) + 1; - if (completedInstances == m_totalInstances) - m_event.notify_all(); - } - - virtual void WaitForTasks() override { - std::unique_lock lock(m_lock); - while (m_completedInstances < m_totalInstances) - m_event.wait(lock); - } - -private: - uint32_t m_totalInstances = 0; - std::atomic m_nextInstance = 0; - std::atomic m_completedInstances = 0; - std::condition_variable m_event; - std::mutex m_lock; -}; - // ===================================================================================================================== // Build raytracing pipeline internally // @@ -3126,28 +3101,29 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, std::unique_ptr pipeline(builderContext->createPipeline()); 
rtContext.setPipelineState(&*pipeline, /*hasher=*/nullptr, unlinked); - std::vector> modules(shaderInfo.size()); mainContext->setBuilder(builderContext->createBuilder(&*pipeline)); - mainContext->ensureGpurtLibrary(); + ModuleBunch bunch; + + { + auto leadModule = std::make_unique("main", *mainContext); + mainContext->setModuleTargetMachine(leadModule.get()); + bunch.addModule(std::move(leadModule)); + } + // Create empty modules and set target machine in each. for (unsigned shaderIndex = 0; shaderIndex < shaderInfo.size(); ++shaderIndex) { const PipelineShaderInfo *shaderInfoEntry = shaderInfo[shaderIndex]; + assert(shaderInfoEntry->pModuleData); std::string moduleName; - if (shaderInfoEntry->pModuleData) { - moduleName = (Twine("_") + getShaderStageAbbreviation(shaderInfoEntry->entryStage) + "_" + - Twine(getModuleIdByIndex(shaderIndex))) - .str(); - moduleName[1] = std::tolower(moduleName[1]); - } else { - moduleName = "main"; - } - modules[shaderIndex] = std::make_unique(moduleName, *mainContext); - mainContext->setModuleTargetMachine(modules[shaderIndex].get()); + moduleName = (Twine("_") + getShaderStageAbbreviation(shaderInfoEntry->entryStage) + "_" + + Twine(getModuleIdByIndex(shaderIndex))) + .str(); + moduleName[1] = std::tolower(moduleName[1]); - if (!shaderInfoEntry->pModuleData) - continue; + auto module = std::make_unique(moduleName, *mainContext); + mainContext->setModuleTargetMachine(module.get()); std::unique_ptr lowerPassMgr(lgc::PassManager::Create(builderContext)); lowerPassMgr->setPassIndex(&passIndex); @@ -3159,11 +3135,12 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, lowerPassMgr->addPass(AlwaysInlinerPass()); // Run the passes. 
- lowerPassMgr->run(*modules[shaderIndex]); + lowerPassMgr->run(*module); + + bunch.addModule(std::move(module)); } // Step 2: Set up traversal module and kernel entry - std::vector> newModules; // Record which module calls TraceRay(), except the first one (For indirect mode, it is the entry function which will // never call TraceRay(). For inlined mode, we don't need to care). std::vector moduleCallsTraceRay; @@ -3178,18 +3155,9 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, const bool isContinuationsMode = rtContext.isContinuationsMode(); - std::unique_ptr entry = std::move(modules.back()); - modules.pop_back(); - shaderInfo = shaderInfo.drop_back(); - - newModules.push_back(std::move(entry)); - for (unsigned shaderIndex = 0; shaderIndex < pipelineInfo->shaderCount; ++shaderIndex) { const auto *shaderInfoEntry = shaderInfo[shaderIndex]; const ShaderModuleData *moduleData = reinterpret_cast(shaderInfoEntry->pModuleData); - auto shaderModule = std::move(modules[shaderIndex]); - - newModules.push_back(std::move(shaderModule)); moduleCallsTraceRay.push_back(moduleData->usage.hasTraceRay); } @@ -3226,47 +3194,58 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, } } - newModules.push_back(std::move(traversal)); + bunch.addModule(std::move(traversal)); moduleCallsTraceRay.push_back(false); } - assert(moduleCallsTraceRay.size() == (newModules.size() - 1)); + assert(moduleCallsTraceRay.size() == bunch.size() - 1); - // Step 3: Run lower passes on all modules - for (unsigned i = 0; i < newModules.size(); i++) { - auto module = (newModules[i].get()); - std::unique_ptr passMgr(lgc::PassManager::Create(builderContext)); + // Steps 3 & 4: + // - Run lower passes on all modules + // - Merge all modules and inline if necessary + { + Timer *lowerTimer = timerProfiler.getTimer(TimerLower); + auto passMgr = lgc::MbPassManager::Create(builderContext->getTargetMachine()); + passMgr->setPassIndex(&passIndex); 
SpirvLower::registerLoweringPasses(*passMgr); - LowerFlag flag = {}; - flag.isRayTracing = true; - flag.isInternalRtShader = false; - SpirvLower::addPasses(mainContext, ShaderStageCompute, *passMgr, timerProfiler.getTimer(TimerLower), flag); - if (isContinuationsMode) { - passMgr->addPass(PrepareContinuations()); - } - passMgr->run(*module); - } - // Step 4: Link module if necessary - if (indirectStageMask == 0) { - auto &mainModule = newModules[0]; - Linker linker(*mainModule); - for (unsigned i = 1; i < newModules.size(); ++i) { - linker.linkInModule(std::move(newModules[i])); + passMgr->addPass(ModuleBunchToModulePassAdaptor([mainContext, isContinuationsMode, lowerTimer]() { + ModulePassManager mpm; + LowerFlag flag = {}; + flag.isRayTracing = true; + flag.isInternalRtShader = false; + SpirvLower::addPasses(mainContext, ShaderStageCompute, mpm, lowerTimer, flag); + if (isContinuationsMode) { + mpm.addPass(PrepareContinuations()); + } + return createForModuleBunchToModulePassAdaptor(std::move(mpm)); + })); + + if (indirectStageMask == 0) { + passMgr->addPass(MergeModulesPass()); + + passMgr->addPass(ModuleBunchToModulePassAdaptor([]() { + ModulePassManager mpm; + mpm.addPass(AlwaysInlinerPass()); + mpm.addPass(ClearNonEntryFunctionsPass("main")); + return createForModuleBunchToModulePassAdaptor(std::move(mpm)); + })); } - std::unique_ptr passMgr(lgc::PassManager::Create(builderContext)); - passMgr->addPass(AlwaysInlinerPass()); - passMgr->run(*mainModule); - clearNonEntryFunctions(mainModule.get(), "main"); - newModules.erase(newModules.begin() + 1, newModules.end()); + + passMgr->run(bunch); } + // Step 5: Generate ELFs + std::vector> newModules; + for (auto &module : bunch.getMutableModules()) + newModules.push_back(std::move(module)); + rtContext.setLinked(true); pipelineElfs.resize(newModules.size()); shaderProps.resize(newModules.size() - 1); // Take entry module, it will be handled at last. 
- entry = std::move(newModules[0]); + std::unique_ptr entry = std::move(newModules[0]); std::unique_ptr traversalModule; if (indirectStageMask != 0 && needTraversal) { @@ -3275,74 +3254,95 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, rtContext.getRayTracingLibrarySummary().hasTraceRayModule = true; } - InternalHelperThreadProvider ourHelperThreadProvider; - if (cl::AddRtHelpers && !helperThreadProvider) - helperThreadProvider = &ourHelperThreadProvider; + struct HelperContext { + Context *context = nullptr; + LgcContext *builderContext = nullptr; + std::unique_ptr pipeline; + TimerProfiler timerProfiler; + bool hasError = false; - // Step 5: Generate ELFs - if (helperThreadProvider) { - std::vector results(newModules.size(), Result::Success); - std::vector modulePointers; - for (const auto &module : newModules) - modulePointers.push_back(module.get()); - HelperThreadBuildRayTracingPipelineElfPayload helperThreadPayload = { - modulePointers, pipelineElfs, shaderProps, moduleCallsTraceRay, results, &rtContext, this, false, false}; - helperThreadProvider->SetTasks(&helperThreadBuildRayTracingPipelineElf, newModules.size(), - static_cast(&helperThreadPayload)); - - unsigned moduleIndex = 0; - // Initial increase to skip entry module, it will be handled later. - helperThreadProvider->GetNextTask(&moduleIndex); - helperThreadProvider->TaskCompleted(); - - std::vector workers(cl::AddRtHelpers); - for (std::thread &worker : workers) { - worker = std::thread([&helperThreadProvider, &helperThreadPayload] { - helperThreadBuildRayTracingPipelineElf(helperThreadProvider, &helperThreadPayload); - }); - } - - while (!helperThreadPayload.helperThreadJoined && helperThreadProvider->GetNextTask(&moduleIndex)) { - // NOTE: When a helper thread joins, it will move modules from the original context into a new one. However, - // main thread may be processing on the original context at the same time, results in out of sync situation. 
- // Here we keep main thread working on the original context until helper thread joins, to reduce the cost of - // initializing new context and copying modules. Once helper thread has joined, main thread must switch to a new - // context. - results[moduleIndex] = - buildRayTracingPipelineElf(mainContext, std::move(newModules[moduleIndex]), pipelineElfs[moduleIndex], - shaderProps, moduleCallsTraceRay, moduleIndex, pipeline, timerProfiler); - helperThreadProvider->TaskCompleted(); - } - - if (helperThreadPayload.helperThreadJoined) { - // Tasks may not finished but helper thread joined, need to switch to new context and notify helper thread to - // proceed. - helperThreadPayload.mainThreadSwitchedContext = true; - m_helperThreadConditionVariable.notify_all(); - helperThreadBuildRayTracingPipelineElf(helperThreadProvider, &helperThreadPayload); - } - helperThreadProvider->WaitForTasks(); - - for (std::thread &worker : workers) - worker.join(); - - for (auto res : results) { - if (res != Result::Success) - return Result::ErrorInvalidShader; - } + HelperContext(Context *context, LgcContext *builderContext, std::unique_ptr pipeline) + : context(context), builderContext(builderContext), pipeline(std::move(pipeline)), + timerProfiler(context->getPipelineHashCode(), "LLPC", TimerProfiler::PipelineTimerEnableMask) {} + }; - } else { - for (auto [moduleIndex, module] : llvm::enumerate(newModules)) { - // Skip entry module here, it will be handled later. 
- if (moduleIndex == 0) - continue; + if (Error err = parallelForWithContext( + cl::AddRtHelpers, helperThreadProvider, newModules.size(), HelperThreadExclusion::Task, + [this, &rtContext]() -> std::unique_ptr { + Context *context = acquireContext(); + context->attachPipelineContext(&rtContext); + + LgcContext *builderContext = context->getLgcContext(); + std::unique_ptr pipeline(builderContext->createPipeline()); + rtContext.setPipelineState(&*pipeline, /*hasher=*/nullptr, false); + context->setBuilder(builderContext->createBuilder(&*pipeline)); + + auto ctx = std::make_unique(context, builderContext, std::move(pipeline)); + ctx->context->setDiagnosticHandler(std::make_unique(&ctx->hasError)); + + ctx->context->ensureGpurtLibrary(); + setUseGpurt(&*ctx->pipeline); + + return ctx; + }, + [this, &newModules, &pipelineElfs, &shaderProps, &moduleCallsTraceRay, &mainContext, &pipeline, + &timerProfiler, &hasError](size_t moduleIndex, HelperContext *ctx) -> Error { + // Skip entry module here, it will be handled later. + if (moduleIndex == 0) + return Error::success(); + + std::unique_ptr module; + + if (!ctx) { + module = std::move(newModules[moduleIndex]); + } else { + // NOTE: All modules were in the same LLVMContext, which is not thread safe. We need to 'clone' the module + // into a separate context here to ensure we can do the work simultaneously. We achieve this by outputting + // the module as bitcode and read it back in another context. + + // FIXME: There will be out of sync assertion when non-trivial work happens on the main context (probably + // in PipelineState::generate) while the helper thread is using the bitcode writer. It would be great to + // find a decent solution for such a situation. + // + // We must not destroy the original module here, as that can cause mutation of cross-module structures + // associated to the LLVMContext. It will be destroyed on the main thread when it goes out of scope. 
+ SmallVector bcBuffer; + BitcodeWriter bcWriter(bcBuffer); + bcWriter.writeModule(*newModules[moduleIndex]); + bcWriter.writeSymtab(); + bcWriter.writeStrtab(); + + SmallVectorMemoryBuffer bcMemBuf(std::move(bcBuffer), newModules[moduleIndex]->getName()); + auto moduleOrErr = getLazyBitcodeModule(std::move(bcMemBuf), *ctx->context); + if (Error err = moduleOrErr.takeError()) { + LLPC_ERRS("Failed to load bit code\n"); + return err; + } + + module = std::move(*moduleOrErr); + if (Error err = module->materializeAll()) { + LLPC_ERRS("Failed to materialize module\n"); + return err; + } + } - Result result = buildRayTracingPipelineElf(mainContext, std::move(module), pipelineElfs[moduleIndex], shaderProps, - moduleCallsTraceRay, moduleIndex, pipeline, timerProfiler); - if (result != Result::Success) - return result; - } - } + Context *context = ctx ? ctx->context : mainContext; + Pipeline *ourPipeline = ctx ? &*ctx->pipeline : &*pipeline; + TimerProfiler *ourTimerProfiler = ctx ? &ctx->timerProfiler : &timerProfiler; + + Result result = + buildRayTracingPipelineElf(context, std::move(module), pipelineElfs[moduleIndex], shaderProps, + moduleCallsTraceRay, moduleIndex, *ourPipeline, *ourTimerProfiler); + if (result == Result::Success && (ctx ? ctx->hasError : hasError)) + result = Result::ErrorInvalidShader; + + return resultToError(result, "building raytracing pipeline ELF"); + }, + [this](std::unique_ptr ctx) { + ctx->context->setDiagnosticHandler(nullptr); + releaseContext(ctx->context); + })) + return reportError(std::move(err), Result::ErrorInvalidShader); // Build traversal at last after we gather all needed information. 
if (traversalModule) { @@ -3355,7 +3355,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, Result result = buildRayTracingPipelineElf(mainContext, std::move(traversalModule), pipelineElfs[newModules.size()], - shaderProps, moduleCallsTraceRay, newModules.size(), pipeline, timerProfiler); + shaderProps, moduleCallsTraceRay, newModules.size(), *pipeline, timerProfiler); if (result != Result::Success) return result; } @@ -3368,7 +3368,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, #endif if (needEntry) { Result result = buildRayTracingPipelineElf(mainContext, std::move(entry), pipelineElfs[0], shaderProps, - moduleCallsTraceRay, 0, pipeline, timerProfiler); + moduleCallsTraceRay, 0, *pipeline, timerProfiler); if (result != Result::Success) return result; diff --git a/llpc/context/llpcCompiler.h b/llpc/context/llpcCompiler.h index d020769684..95ef89abab 100644 --- a/llpc/context/llpcCompiler.h +++ b/llpc/context/llpcCompiler.h @@ -134,6 +134,9 @@ class Compiler : public ICompiler { RayTracingPipelineBuildOut *pipelineOut, void *pipelineDumpFile = nullptr, IHelperThreadProvider *pHelperThreadProvider = nullptr); + Result buildTransformVertexShader(Context *context, const PipelineShaderInfo *shaderInfo, + llvm::raw_pwrite_stream &outStream); + Result buildGraphicsPipelineInternal(GraphicsContext *graphicsContext, llvm::ArrayRef shaderInfo, bool buildingRelocatableElf, ElfPackage *pipelineElf, @@ -175,9 +178,8 @@ class Compiler : public ICompiler { Result buildRayTracingPipelineElf(Context *context, std::unique_ptr module, ElfPackage &pipelineElf, std::vector &shaderProps, std::vector &moduleCallsTraceRay, unsigned moduleIndex, - std::unique_ptr &pipeline, TimerProfiler &timerProfiler); + lgc::Pipeline &pipeline, TimerProfiler &timerProfiler); llvm::sys::Mutex &getHelperThreadMutex() { return m_helperThreadMutex; } - std::condition_variable_any &getHelperThreadConditionVariable() { return 
m_helperThreadConditionVariable; } void setUseGpurt(lgc::Pipeline *pipeline); @@ -217,8 +219,6 @@ class Compiler : public ICompiler { static std::vector *m_contextPool; // Context pool unsigned m_relocatablePipelineCompilations; // The number of pipelines compiled using relocatable shader elf static llvm::sys::Mutex m_helperThreadMutex; // Mutex for helper thread - static std::condition_variable_any m_helperThreadConditionVariable; // Condition variable used by helper thread to - // wait for main thread switching context void buildShaderModuleResourceUsage( const ShaderModuleBuildInfo *shaderInfo, SPIRV::SPIRVModule *module, Vkgc::ResourcesNodes &resourcesNodes, diff --git a/llpc/context/llpcComputeContext.cpp b/llpc/context/llpcComputeContext.cpp index 8f5ff5e4b6..60aa36f50f 100644 --- a/llpc/context/llpcComputeContext.cpp +++ b/llpc/context/llpcComputeContext.cpp @@ -46,8 +46,10 @@ namespace Llpc { // @param pipelineHash : Pipeline hash code // @param cacheHash : Cache hash code ComputeContext::ComputeContext(GfxIpVersion gfxIp, const char *apiName, const ComputePipelineBuildInfo *pipelineInfo, - MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash) - : PipelineContext(gfxIp, apiName, pipelineHash, cacheHash), m_pipelineInfo(pipelineInfo) { + const StringRef vertexShaderStream, MetroHash::Hash *pipelineHash, + MetroHash::Hash *cacheHash) + : PipelineContext(gfxIp, apiName, pipelineHash, cacheHash), m_pipelineInfo(pipelineInfo), + m_vertexShaderStream(vertexShaderStream) { const Vkgc::BinaryData *gpurtShaderLibrary = nullptr; #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 62 gpurtShaderLibrary = &pipelineInfo->shaderLibrary; @@ -82,6 +84,20 @@ void ComputeContext::setPipelineState(lgc::Pipeline *pipeline, Util::MetroHash64 if (pipeline) pipeline->setShaderOptions(lgc::ShaderStage::Compute, computeShaderOptions(m_pipelineInfo->cs)); + if (m_pipelineInfo->transformGraphicsPipeline != nullptr) { + auto gfxBuildInfo = + 
static_cast(getPipelineBuildInfo())->transformGraphicsPipeline; + setVertexInputDescriptions(pipeline, gfxBuildInfo, hasher); + + lgc::InputAssemblyState iaState = {}; + lgc::RasterizerState rsState = {}; + + auto iaStateBuildInfo = m_pipelineInfo->transformGraphicsPipeline->iaState; + iaState.disableVertexReuse = iaStateBuildInfo.disableVertexReuse; + iaState.useVertexBufferDescArray = iaStateBuildInfo.useVertexBufferDescArray; + + pipeline->setGraphicsState(iaState, rsState); + } } // ===================================================================================================================== diff --git a/llpc/context/llpcComputeContext.h b/llpc/context/llpcComputeContext.h index 9da9ade4a6..cf73f272cd 100644 --- a/llpc/context/llpcComputeContext.h +++ b/llpc/context/llpcComputeContext.h @@ -39,7 +39,7 @@ namespace Llpc { class ComputeContext : public PipelineContext { public: ComputeContext(GfxIpVersion gfxIp, const char *apiName, const ComputePipelineBuildInfo *pipelineInfo, - MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash); + const llvm::StringRef vertexShaderStream, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash); virtual ~ComputeContext() = default; virtual PipelineType getPipelineType() const override { return PipelineType::Compute; } @@ -67,12 +67,16 @@ class ComputeContext : public PipelineContext { // Gets client-defined metadata virtual llvm::StringRef getClientMetadata() const override; + // Get transform vertex shader module bit-code + const llvm::StringRef getVtxShaderStream() const { return m_vertexShaderStream; } + private: ComputeContext() = delete; ComputeContext(const ComputeContext &) = delete; ComputeContext &operator=(const ComputeContext &) = delete; const ComputePipelineBuildInfo *m_pipelineInfo; // Info to build a compute pipeline + llvm::StringRef m_vertexShaderStream; // llvm bitcode for vertex shader module }; } // namespace Llpc diff --git a/llpc/context/llpcGraphicsContext.cpp 
b/llpc/context/llpcGraphicsContext.cpp index c736b8014f..c7f79a7b2b 100644 --- a/llpc/context/llpcGraphicsContext.cpp +++ b/llpc/context/llpcGraphicsContext.cpp @@ -194,7 +194,8 @@ void GraphicsContext::setPipelineState(Pipeline *pipeline, Util::MetroHash64 *ha if ((stageMask & ~shaderStageToMask(ShaderStageFragment))) { // Set vertex input descriptions to the middle-end. - setVertexInputDescriptions(pipeline, hasher); + auto gfxBuildInfo = static_cast(getPipelineBuildInfo()); + setVertexInputDescriptions(pipeline, gfxBuildInfo, hasher); } if ((isShaderStageInMask(ShaderStageFragment, stageMask) && (!unlinked || DisableColorExportShader)) || @@ -343,90 +344,6 @@ void GraphicsContext::setColorExportState(Pipeline *pipeline, Util::MetroHash64 pipeline->setColorExportState(formats, state); } -// ===================================================================================================================== -// Set vertex input descriptions in middle-end Pipeline object, or hash them. -// -// @param [in/out] pipeline : Middle-end pipeline object; nullptr if only hashing -// @param [in/out] hasher : Hasher object; nullptr if only setting LGC pipeline state -void GraphicsContext::setVertexInputDescriptions(Pipeline *pipeline, Util::MetroHash64 *hasher) const { - auto vertexInput = static_cast(getPipelineBuildInfo())->pVertexInput; - if (!vertexInput) - return; - - if (hasher) { - PipelineDumper::updateHashForVertexInputState( - vertexInput, static_cast(getPipelineBuildInfo())->dynamicVertexStride, - hasher); - } - if (!pipeline) - return; // Only hashing. - - // Gather the bindings. 
- SmallVector bindings; - for (unsigned i = 0; i < vertexInput->vertexBindingDescriptionCount; ++i) { - auto binding = &vertexInput->pVertexBindingDescriptions[i]; - unsigned idx = binding->binding; - if (idx >= bindings.size()) - bindings.resize(idx + 1); - bindings[idx].binding = binding->binding; - bindings[idx].stride = binding->stride; - switch (binding->inputRate) { - case VK_VERTEX_INPUT_RATE_VERTEX: - bindings[idx].inputRate = VertexInputRateVertex; - break; - case VK_VERTEX_INPUT_RATE_INSTANCE: - bindings[idx].inputRate = VertexInputRateInstance; - bindings[idx].divisor = 1; // Set default divisor - break; - default: - llvm_unreachable("Should never be called!"); - } - } - - // Check for divisors. - auto vertexDivisor = findVkStructInChain( - VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT, vertexInput->pNext); - if (vertexDivisor) { - for (unsigned i = 0; i < vertexDivisor->vertexBindingDivisorCount; ++i) { - auto divisor = &vertexDivisor->pVertexBindingDivisors[i]; - if (divisor->binding <= bindings.size()) - bindings[divisor->binding].divisor = divisor->divisor; - } - } - - // Gather the vertex inputs. - SmallVector descriptions; - auto vbLowBits = - static_cast(getPipelineBuildInfo())->getGlState().vbAddressLowBits; - auto vbAddressLowBitsKnown = - static_cast(getPipelineBuildInfo())->getGlState().vbAddressLowBitsKnown; - for (unsigned i = 0; i < vertexInput->vertexAttributeDescriptionCount; ++i) { - auto attrib = &vertexInput->pVertexAttributeDescriptions[i]; - if (attrib->binding >= bindings.size()) - continue; - auto binding = &bindings[attrib->binding]; - if (binding->binding != attrib->binding) - continue; - - auto dfmt = BufDataFormatInvalid; - auto nfmt = BufNumFormatUnorm; - std::tie(dfmt, nfmt) = mapVkFormat(attrib->format, /*isColorExport=*/false); - const uint8_t vbOffsetLowBits = vbAddressLowBitsKnown ? 
vbLowBits[attrib->binding] : 0; - - if (dfmt != BufDataFormatInvalid) { - descriptions.push_back( - {attrib->location, attrib->binding, attrib->offset, - (static_cast(getPipelineBuildInfo())->dynamicVertexStride - ? 0 - : binding->stride), - dfmt, nfmt, binding->inputRate, binding->divisor, vbOffsetLowBits}); - } - } - - // Give the vertex input descriptions to the middle-end Pipeline object. - pipeline->setVertexInputDescriptions(descriptions); -} - // ===================================================================================================================== // Give the graphics pipeline state to the middle-end, and/or hash it. If stageMask has no pre-rasterization shader // stages, do not consider pre-rasterization pipeline state. If stageMask has no FS, do not consider FS state. diff --git a/llpc/context/llpcGraphicsContext.h b/llpc/context/llpcGraphicsContext.h index 94a10d42cc..675fa12a13 100644 --- a/llpc/context/llpcGraphicsContext.h +++ b/llpc/context/llpcGraphicsContext.h @@ -90,9 +90,6 @@ class GraphicsContext : public PipelineContext { void setColorExportState(lgc::Pipeline *pipeline, Util::MetroHash64 *hasher, bool disableDualSourceBlend = false) const; - // Set vertex input descriptions in middle-end Pipeline, and/or hash them. - void setVertexInputDescriptions(lgc::Pipeline *pipeline, Util::MetroHash64 *hasher) const; - // Give the graphics pipeline state to the middle-end, and/or hash it. 
void setGraphicsStateInPipeline(lgc::Pipeline *pipeline, Util::MetroHash64 *hasher, unsigned stageMask) const; diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp index ba52048883..42976ba2d2 100644 --- a/llpc/context/llpcPipelineContext.cpp +++ b/llpc/context/llpcPipelineContext.cpp @@ -325,6 +325,86 @@ Options PipelineContext::computePipelineOptions() const { return options; } +// ===================================================================================================================== +// Set vertex input descriptions in middle-end Pipeline object, or hash them. +// +// @param [in/out] pipeline : Middle-end pipeline object; nullptr if only hashing +// @param [in/out] gfxBuildInfo : graphics pipeline build info +// @param [in/out] hasher : Hasher object; nullptr if only setting LGC pipeline state +void PipelineContext::setVertexInputDescriptions(Pipeline *pipeline, const GraphicsPipelineBuildInfo *gfxBuildInfo, + Util::MetroHash64 *hasher) const { + if (!gfxBuildInfo || !pipeline) + return; + + auto vertexInput = gfxBuildInfo->pVertexInput; + if (!vertexInput) + return; + + if (hasher) { + PipelineDumper::updateHashForVertexInputState(vertexInput, gfxBuildInfo->dynamicVertexStride, hasher); + } + + // Gather the bindings. 
+ SmallVector bindings; + for (unsigned i = 0; i < vertexInput->vertexBindingDescriptionCount; ++i) { + auto binding = &vertexInput->pVertexBindingDescriptions[i]; + unsigned idx = binding->binding; + if (idx >= bindings.size()) + bindings.resize(idx + 1); + bindings[idx].binding = binding->binding; + bindings[idx].stride = binding->stride; + switch (binding->inputRate) { + case VK_VERTEX_INPUT_RATE_VERTEX: + bindings[idx].inputRate = VertexInputRateVertex; + break; + case VK_VERTEX_INPUT_RATE_INSTANCE: + bindings[idx].inputRate = VertexInputRateInstance; + bindings[idx].divisor = 1; // Set default divisor + break; + default: + llvm_unreachable("Should never be called!"); + } + } + + // Check for divisors. + auto vertexDivisor = findVkStructInChain( + VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT, vertexInput->pNext); + if (vertexDivisor) { + for (unsigned i = 0; i < vertexDivisor->vertexBindingDivisorCount; ++i) { + auto divisor = &vertexDivisor->pVertexBindingDivisors[i]; + if (divisor->binding <= bindings.size()) + bindings[divisor->binding].divisor = divisor->divisor; + } + } + + // Gather the vertex inputs. + SmallVector descriptions; + auto vbLowBits = gfxBuildInfo->getGlState().vbAddressLowBits; + auto vbAddressLowBitsKnown = gfxBuildInfo->getGlState().vbAddressLowBitsKnown; + for (unsigned i = 0; i < vertexInput->vertexAttributeDescriptionCount; ++i) { + auto attrib = &vertexInput->pVertexAttributeDescriptions[i]; + if (attrib->binding >= bindings.size()) + continue; + auto binding = &bindings[attrib->binding]; + if (binding->binding != attrib->binding) + continue; + + auto dfmt = BufDataFormatInvalid; + auto nfmt = BufNumFormatUnorm; + std::tie(dfmt, nfmt) = mapVkFormat(attrib->format, /*isColorExport=*/false); + const uint8_t vbOffsetLowBits = vbAddressLowBitsKnown ? 
vbLowBits[attrib->binding] : 0; + + if (dfmt != BufDataFormatInvalid) { + descriptions.push_back({attrib->location, attrib->binding, attrib->offset, + (gfxBuildInfo->dynamicVertexStride ? 0 : binding->stride), dfmt, nfmt, binding->inputRate, + binding->divisor, vbOffsetLowBits}); + } + } + + // Give the vertex input descriptions to the middle-end Pipeline object. + pipeline->setVertexInputDescriptions(descriptions); +} + // ===================================================================================================================== // Give the user data nodes and descriptor range values to the middle-end. // The user data nodes have been merged so they are the same in each shader stage. Get them from @@ -714,6 +794,7 @@ ShaderOptions PipelineContext::computeShaderOptions(const PipelineShaderInfo &sh static_cast(shaderInfo.options.aggressiveInvariantLoads); shaderOptions.viewIndexFromDeviceIndex = shaderInfo.options.viewIndexFromDeviceIndex; + shaderOptions.forceUnderflowPrevention = shaderInfo.options.forceUnderflowPrevention; return shaderOptions; } diff --git a/llpc/context/llpcPipelineContext.h b/llpc/context/llpcPipelineContext.h index b30ba607ed..d268076a3f 100644 --- a/llpc/context/llpcPipelineContext.h +++ b/llpc/context/llpcPipelineContext.h @@ -243,6 +243,10 @@ class PipelineContext { // Give the pipeline options to the middle-end, and/or hash them. virtual lgc::Options computePipelineOptions() const; + // Set vertex input descriptions in middle-end Pipeline, and/or hash them. 
+ void setVertexInputDescriptions(lgc::Pipeline *pipeline, const GraphicsPipelineBuildInfo *gfxBuildInfo, + Util::MetroHash64 *hasher) const; + GfxIpVersion m_gfxIp; // Graphics IP version info const char *m_apiName; // API name from client, "Vulkan" or "OpenGL" MetroHash::Hash m_pipelineHash; // Pipeline hash code diff --git a/llpc/context/llpcRayTracingContext.cpp b/llpc/context/llpcRayTracingContext.cpp index 84e639fe4b..f0526bfefe 100644 --- a/llpc/context/llpcRayTracingContext.cpp +++ b/llpc/context/llpcRayTracingContext.cpp @@ -105,8 +105,11 @@ void RayTracingContext::collectBuiltIn(unsigned builtIn) { // @param type : Payload type // @param dataLayout : Payload module data layout void RayTracingContext::collectPayloadSize(llvm::Type *type, const DataLayout &dataLayout) { - unsigned payloadTypeSize = alignTo(dataLayout.getTypeAllocSize(type), 4); - m_rtLibSummary.maxRayPayloadSize = std::max(m_rtLibSummary.maxRayPayloadSize, payloadTypeSize); + // Workaround for Proton games that use a dynamically determined payload size instead of the declared payload size. + if (getRayTracingPipelineBuildInfo()->rtIgnoreDeclaredPayloadSize == false) { + unsigned payloadTypeSize = alignTo(dataLayout.getTypeAllocSize(type), 4); + m_rtLibSummary.maxRayPayloadSize = std::max(m_rtLibSummary.maxRayPayloadSize, payloadTypeSize); + } } // ===================================================================================================================== diff --git a/llpc/lowering/LinkTransformShaders.cpp b/llpc/lowering/LinkTransformShaders.cpp new file mode 100644 index 0000000000..c04320eef0 --- /dev/null +++ b/llpc/lowering/LinkTransformShaders.cpp @@ -0,0 +1,139 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LinkTransformShaders.cpp + * @brief Link a prepared vertex shader into a transform compute shader. 
+ *********************************************************************************************************************** + */ +#include "LinkTransformShaders.h" +#include "LoweringUtil.h" +#include "llpcComputeContext.h" +#include "llpcContext.h" +#include "compilerutils/CompilerUtils.h" +#include "lgc/Builder.h" +#include "llvm/Bitcode/BitcodeReader.h" + +#define DEBUG_TYPE "link-transform-shader" + +using namespace lgc; +using namespace llvm; +using namespace Llpc; + +namespace Llpc { + +static const char TransformVsEntry[] = "TransformVertexEntry"; +static const char TransformVertex[] = "TransformVertexAmd"; +static const char GetTransformVertexAttribute[] = "GetTransformVertexAttributeAmd"; + +// ===================================================================================================================== +// Executes this SPIR-V lowering pass on the specified LLVM module. +// +// @param [in/out] module : LLVM module to be run on +// @param [in/out] analysisManager : Analysis manager to use for this transformation +PreservedAnalyses LinkTransformShaders::run(Module &module, ModuleAnalysisManager &analysisManager) { + SpirvLower::init(&module); + LLVM_DEBUG(dbgs() << "Run the pass Lower-transform-shader\n"); + processTransformCsFunctions(module); + return PreservedAnalyses::none(); +} + +// ===================================================================================================================== +// traverse the functions in the module, call the TransformVertexEntry to overwrite the predefined functions in +// compute shader +// +// @param [in/out] module : LLVM module to be run on +void LinkTransformShaders::processTransformCsFunctions(Module &module) { + auto llpcContext = static_cast(&module.getContext()); + auto computeContext = static_cast(llpcContext->getPipelineContext()); + assert(computeContext != nullptr); + + // The bitcode of transform vertex shader is stored in computeContext, convert it to llvm IR before being linked + 
std::unique_ptr vtxShaderModule; + if (computeContext != nullptr) { + auto vtxShaderStream = computeContext->getVtxShaderStream(); + MemoryBufferRef bcBufferRef(vtxShaderStream, ""); + Expected> moduleOrErr = parseBitcodeFile(bcBufferRef, *llpcContext); + if (!moduleOrErr) + report_fatal_error("Failed to read bitcode"); + vtxShaderModule = std::move(*moduleOrErr); + + // After translating LLVM IR to bitcode, the module ID is lost, so it needs to be set explicitly + vtxShaderModule->setModuleIdentifier("transform-runtime"); + } + + // Link the gfxruntime library module + auto *transformVsEntry = vtxShaderModule->getFunction(TransformVsEntry); + for (auto funcIt = module.begin(), funcEnd = module.end(); funcIt != funcEnd;) { + Function *func = &*funcIt++; + auto funcName = func->getName(); + if (funcName.starts_with(GetTransformVertexAttribute)) { + // Handle VS output for object selection and feedback, the function GetTransformVertexAttribute will be + // overwritten. All the VS built-in outputs will be required for selection and feedback + processLibraryFunction(func, transformVsEntry, false); + } else if (funcName.starts_with(TransformVertex)) { + // Handle VS output for primitive culling, the function TransformVertexAmd will be overwritten, only gl_Position + // will be required for primitive culling + processLibraryFunction(func, transformVsEntry, true); + } + } +} + +// ===================================================================================================================== +// Call the shader library to overwrite the predefined function TransformVertexAmd or GetTransformVertexAttribute +// +// @param func : The function to process in transform compute shader +// @param transformVsFunc : The function to be called in transform vertex shader +// @param primCulling : Whether the function is called for primitive culling +void LinkTransformShaders::processLibraryFunction(Function *&func, Function *transformVsFunc, bool primCulling) { + 
m_builder->SetInsertPoint(clearBlock(func)); + + // Cross module inliner cannot be used to inline a function with multiple blocks into a degenerate block, create + // a temporary terminator first. + auto tempTerminator = m_builder->CreateUnreachable(); + m_builder->SetInsertPoint(tempTerminator); + + SmallVector args; + auto int32Ty = m_builder->getInt32Ty(); + for (auto &arg : func->args()) { + Value *v = m_builder->CreateLoad(int32Ty, &arg); + args.push_back(v); + } + + CompilerUtils::CrossModuleInliner inliner; + auto *vsOutput = inliner.inlineCall(*m_builder, transformVsFunc, {args}).returnValue; + + if (primCulling) { + // For primitive culling, return gl_Position + m_builder->CreateRet(m_builder->CreateExtractValue(vsOutput, 0)); + tempTerminator->eraseFromParent(); + } else { + // For selection and feedback, return all the required built-in outputs + m_builder->CreateRet(vsOutput); + tempTerminator->eraseFromParent(); + } +} + +} // namespace Llpc diff --git a/llpc/lowering/LinkTransformShaders.h b/llpc/lowering/LinkTransformShaders.h new file mode 100644 index 0000000000..09af81f227 --- /dev/null +++ b/llpc/lowering/LinkTransformShaders.h @@ -0,0 +1,49 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LinkTransformShaders.h + * @brief LLPC header file: contains declaration of Llpc::LinkTransformShaders + *********************************************************************************************************************** + */ +#pragma once + +#include "Lowering.h" +#include "llvm/IR/PassManager.h" + +namespace Llpc { + +class LinkTransformShaders : public SpirvLower, public llvm::PassInfoMixin { +public: + LinkTransformShaders() {} + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + static llvm::StringRef name() { return "Lower transform shader"; } + +private: + void processTransformCsFunctions(llvm::Module &module); + void processLibraryFunction(llvm::Function *&func, llvm::Function *transformVsFunc, bool primCulling); +}; + +} // namespace Llpc diff --git a/llpc/lowering/LowerAccessChain.cpp b/llpc/lowering/LowerAccessChain.cpp index 5335ccb4ce..8f6cc8ecb8 100644 --- a/llpc/lowering/LowerAccessChain.cpp +++ b/llpc/lowering/LowerAccessChain.cpp @@ -266,7 +266,7 @@ Instruction *LowerAccessChain::tryToCoalesceChain(Instruction *getElemPtr) { // Create the coalesced "getelementptr" instruction (do combining) auto *coalescedGetElemPtr = isCustomGep ? 
cast(m_builder->create(basePtr, coalescedType, false, indices)) - : cast(GetElementPtrInst::Create(coalescedType, basePtr, indices, "", getElemPtr)); + : cast(m_builder->CreateGEP(coalescedType, basePtr, indices)); getElemPtr->replaceAllUsesWith(coalescedGetElemPtr); // Remove dead "getelementptr" instructions where possible. diff --git a/llpc/lowering/LowerGlCompatibility.cpp b/llpc/lowering/LowerGlCompatibility.cpp index 37982f849e..7dc1c32a37 100644 --- a/llpc/lowering/LowerGlCompatibility.cpp +++ b/llpc/lowering/LowerGlCompatibility.cpp @@ -48,8 +48,8 @@ namespace Llpc { // ===================================================================================================================== LowerGlCompatibility::LowerGlCompatibility() - : m_retInst(nullptr), m_entryPointEnd(nullptr), m_originalEntryBlock(nullptr), m_out(nullptr), - m_clipVertex(nullptr), m_clipDistance(nullptr), m_clipPlane(nullptr), m_frontColor(nullptr), m_backColor(nullptr), + : m_retInst(nullptr), m_entryPointEnd(nullptr), m_originalEntryBlock(nullptr), m_clipVertex(nullptr), + m_clipDistance(nullptr), m_clipPlane(nullptr), m_frontColor(nullptr), m_backColor(nullptr), m_frontSecondaryColor(nullptr), m_backSecondaryColor(nullptr), m_color(nullptr), m_secondaryColor(nullptr), m_frontFacing(nullptr), m_patchTexCoord(nullptr), m_fragColor(nullptr), m_fragDepth(), m_fragStencilRef() { } @@ -63,14 +63,9 @@ PreservedAnalyses LowerGlCompatibility::run(Module &module, ModuleAnalysisManage SpirvLower::init(&module); LLVM_DEBUG(dbgs() << "Run the pass Lower-gl-compatibility\n"); - if (!needRun()) - return PreservedAnalyses::all(); - collectEmulationResource(); - if (!needLowerClipVertex() && !needLowerFrontColor() && !needLowerBackColor() && !needLowerFrontSecondaryColor() && - !needLowerBackSecondaryColor() && !needEmulateDrawPixels() && !needEmulateTwoSideLighting() && - !needEmulateBitmap() && !needLowerFragColor() && !needEmulateSmoothStipple() && !needLowerAlphaTest()) + if (!needRun()) return 
PreservedAnalyses::all(); buildPatchPositionInfo(); @@ -155,49 +150,22 @@ unsigned LowerGlCompatibility::getUniformLocation(llvm::GlobalVariable *var) { } // ===================================================================================================================== -// Get in/out meta data by indices from from aggregate type. +// Retrieves metadata for shader input/output elements based on their type. // -// @param [in] valueTy : The metadata's embellish type. -// @param [in] mds : The metadata constant of InOut Global variable to be decode. -// @param [in] index : The the index of the metadata in the embellish type. -// @param [out] out : Use to output the element's metadatas of the InOut Global variable. -void LowerGlCompatibility::decodeInOutMetaRecursivelyByIndex(llvm::Type *valueTy, llvm::Constant *mds, - ArrayRef index, - llvm::SmallVector &out) { - auto currentType = valueTy; - auto currentMds = mds; - if (!index.empty()) { - if (valueTy->isSingleValueType()) { - // Single type's metadata:{uint64, uint64} - assert(mds->getType() == StructType::get(*m_context, {m_builder->getInt64Ty(), m_builder->getInt64Ty()})); - ShaderInOutMetadata md = {}; - md.U64All[0] = cast(mds->getOperand(0))->getZExtValue(); - md.U64All[1] = cast(mds->getOperand(1))->getZExtValue(); - out.push_back(md); - } else if (valueTy->isArrayTy()) { - assert(mds->getType()->getStructNumElements() == 4); - currentType = valueTy->getArrayElementType(); - currentMds = cast(mds->getOperand(1)); - index = index.drop_front(); - if (index.empty()) - decodeInOutMetaRecursively(currentType, currentMds, out); - else { - decodeInOutMetaRecursivelyByIndex(currentType, currentMds, index, out); - } - } else if (valueTy->isStructTy()) { - // Structure type's metadata:[{element metadata type}, ...] 
- assert(valueTy->getStructNumElements() == mds->getType()->getStructNumElements()); - auto opIdx = cast(index[0])->getZExtValue(); - currentType = valueTy->getStructElementType(opIdx); - currentMds = cast(mds->getOperand(opIdx)); - index = index.drop_front(); - if (index.empty()) - decodeInOutMetaRecursively(currentType, currentMds, out); - else { - decodeInOutMetaRecursivelyByIndex(currentType, currentMds, index, out); - } - } +// @param elementType : Type of the shader input/output element +// @param elementMetadata : Metadata values for initializing the metadata structure +ShaderInOutMetadata LowerGlCompatibility::getShaderInOutMetadata(Type *elementType, Constant *elementMetadata) { + ShaderInOutMetadata inOutMeta = {}; + if (elementType->isArrayTy()) { + assert(elementMetadata->getNumOperands() == 4); + inOutMeta.U64All[0] = cast(elementMetadata->getOperand(2))->getZExtValue(); + inOutMeta.U64All[1] = cast(elementMetadata->getOperand(3))->getZExtValue(); + } else { + assert(elementMetadata->getNumOperands() == 2); + inOutMeta.U64All[0] = cast(elementMetadata->getOperand(0))->getZExtValue(); + inOutMeta.U64All[1] = cast(elementMetadata->getOperand(1))->getZExtValue(); } + return inOutMeta; } // ===================================================================================================================== @@ -252,151 +220,92 @@ void LowerGlCompatibility::collectEmitInst() { void LowerGlCompatibility::collectEmulationResource() { // Collect emulation information. for (auto &global : m_module->globals()) { + Type *valueType = global.getValueType(); + // Note: The compatibility type structure or array of structures will be separated in this lowering pass + // by ScalarReplacementOfBuiltins. There are no types to handle in this lower compatibility pass. 
+ if (valueType->isStructTy()) + continue; if (global.getType()->getAddressSpace() == SPIRAS_Uniform && global.hasMetadata(gSPIRVMD::UniformConstant)) { if (getUniformLocation(&global) == Vkgc::GlCompatibilityUniformLocation::ClipPlane) { assert(m_clipPlane == nullptr); m_clipPlane = &global; } } else if (global.getType()->getAddressSpace() == SPIRAS_Input) { - llvm::SmallVector mds; MDNode *metaNode = global.getMetadata(gSPIRVMD::InOut); assert(metaNode); - auto inOutMetaConst = mdconst::extract(metaNode->getOperand(0)); - auto valueType = global.getValueType(); - bool isStructureOrArrayOfStructure = - (valueType->isStructTy() || (valueType->isArrayTy() && valueType->getArrayElementType()->isStructTy())); - decodeInOutMetaRecursively(valueType, inOutMetaConst, mds); + Constant *inOutMetaConst = mdconst::extract(metaNode->getOperand(0)); + ShaderInOutMetadata inOutMeta = getShaderInOutMetadata(valueType, inOutMetaConst); + unsigned builtInId = inOutMeta.Value; + if (m_shaderStage == ShaderStageFragment) { // In fragment shader, gl_Color have same location with gl_FrontColor in pre-stage outputs. // gl_SecondaryColor have same location with gl_FrontSecondaryColor in pre-stage outputs. 
// So we can use location of gl_FrontColor and gl_FrontSecondaryColor to find gl_Color and gl_FrontColor - for (auto md : mds) { - if (md.IsLoc) { - if (md.Value == Vkgc::GlCompatibilityInOutLocation::FrontColor) { - if (isStructureOrArrayOfStructure) - m_out = &global; - else - m_color = &global; - } - if (md.Value == Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor) { - if (isStructureOrArrayOfStructure) - m_out = &global; - else - m_secondaryColor = &global; - } - } + if (inOutMeta.IsLoc) { + if (builtInId == Vkgc::GlCompatibilityInOutLocation::FrontColor) + m_color = &global; + else if (builtInId == Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor) + m_secondaryColor = &global; } } } else if (global.getType()->getAddressSpace() == SPIRAS_Output) { - llvm::SmallVector mds; MDNode *metaNode = global.getMetadata(gSPIRVMD::InOut); assert(metaNode); - auto inOutMetaConst = mdconst::extract(metaNode->getOperand(0)); - auto valueType = global.getValueType(); - bool isStructureOrArrayOfStructure = - (valueType->isStructTy() || (valueType->isArrayTy() && valueType->getArrayElementType()->isStructTy())); - assert(!isStructureOrArrayOfStructure); + Constant *inOutMetaConst = mdconst::extract(metaNode->getOperand(0)); + ShaderInOutMetadata inOutMeta = getShaderInOutMetadata(valueType, inOutMetaConst); + unsigned builtInId = inOutMeta.Value; - decodeInOutMetaRecursively(valueType, inOutMetaConst, mds); if (m_shaderStage == ShaderStageFragment) { - for (auto md : mds) { - if (md.IsBuiltIn) { - if (md.Value == spv::BuiltInFragDepth) { - m_fragDepth = &global; - } - if (md.Value == spv::BuiltInFragStencilRefEXT) { - m_fragStencilRef = &global; - } - } else { - assert(m_fragColor == nullptr); + if (inOutMeta.IsBuiltIn) { + if (builtInId == spv::BuiltInFragDepth) + m_fragDepth = &global; + else if (builtInId == spv::BuiltInFragStencilRefEXT) + m_fragStencilRef = &global; + } else { + if (builtInId == Vkgc::GlCompatibilityInOutLocation::SpecialFragOut) m_fragColor = 
&global; - } - } - } - for (auto md : mds) { - if (md.IsLoc) { - if (md.Value == Vkgc::GlCompatibilityInOutLocation::ClipVertex) { - m_clipVertex = &global; - } - if (md.Value == Vkgc::GlCompatibilityInOutLocation::FrontColor) { - if (isStructureOrArrayOfStructure) - m_out = &global; - else - m_frontColor = &global; - } - if (md.Value == Vkgc::GlCompatibilityInOutLocation::BackColor) { - if (isStructureOrArrayOfStructure) - m_out = &global; - else - m_backColor = &global; - } - if (md.Value == Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor) { - if (isStructureOrArrayOfStructure) - m_out = &global; - else - m_frontSecondaryColor = &global; - } - if (md.Value == Vkgc::GlCompatibilityInOutLocation::BackSecondaryColor) { - if (isStructureOrArrayOfStructure) - m_out = &global; - else - m_backSecondaryColor = &global; - } - } else if (md.IsBuiltIn) { - if (md.Value == spv::BuiltInClipDistance) { - m_clipDistance = &global; - } - if (md.Value == spv::BuiltInFrontFacing) - m_frontFacing = &global; } } - } - } - // If gl_in/gl_out used in shader, then the Gl deprecated builtin variable will be pack in the structure: - // gl_PerVertex. We need traversal the user of m_out to get the usage information Gl deprecated builtin variable. - if (m_out != nullptr) { - assert((m_clipVertex == nullptr) && (m_clipDistance == nullptr)); - llvm::SmallVector mds; - auto glOut = cast(m_out); - MDNode *metaNode = glOut->getMetadata(gSPIRVMD::InOut); - assert(metaNode); - auto inOutMetaConst = mdconst::extract(metaNode->getOperand(0)); - for (User *user : m_out->users()) { - SmallVector indexOperands; - // The user is a GEP - // Check to see if the value has been stored. - bool beenModified = false; - User *gep = nullptr; - assert(!isa(user) && !isa(user)); - if (auto *gepInst = dyn_cast(user)) { - // We shouldn't have any chained GEPs here, they are coalesced by the LowerAccessChain pass. 
- assert(cast(*gepInst->getIndices().begin())->isZero() && "Non-zero GEP first index\n"); - for (auto *idx : llvm::drop_begin(gepInst->getIndices())) - indexOperands.push_back(m_builder->CreateZExtOrTrunc(idx, m_builder->getInt32Ty())); - gep = gepInst; - } - if (gep != nullptr) { - for (User *gepUser : gep->users()) { - assert(!isa(gepUser)); - beenModified |= isa(gepUser); + if (inOutMeta.IsLoc) { + switch (builtInId) { + case Vkgc::GlCompatibilityInOutLocation::ClipVertex: + m_clipVertex = &global; + break; + case Vkgc::GlCompatibilityInOutLocation::FrontColor: + m_frontColor = &global; + break; + case Vkgc::GlCompatibilityInOutLocation::BackColor: + m_backColor = &global; + break; + case Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor: + m_frontSecondaryColor = &global; + break; + case Vkgc::GlCompatibilityInOutLocation::BackSecondaryColor: + m_backSecondaryColor = &global; + break; + default: + break; } - decodeInOutMetaRecursivelyByIndex(glOut->getValueType(), inOutMetaConst, indexOperands, mds); - for (auto md : mds) { - if (md.IsLoc) { - if (beenModified && (md.Value == Vkgc::GlCompatibilityInOutLocation::ClipVertex)) - m_clipVertex = gep; - if (beenModified && (md.Value == Vkgc::GlCompatibilityInOutLocation::FrontColor)) - m_frontColor = gep; - if (beenModified && (md.Value == Vkgc::GlCompatibilityInOutLocation::BackColor)) - m_backColor = gep; - if (beenModified && (md.Value == Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor)) - m_frontSecondaryColor = gep; - if (beenModified && (md.Value == Vkgc::GlCompatibilityInOutLocation::BackSecondaryColor)) - m_backSecondaryColor = gep; - } else if (md.IsBuiltIn && md.Value == spv::BuiltInClipDistance) { - m_clipDistance = gep; - } + } else if (inOutMeta.IsBuiltIn) { + switch (builtInId) { + case spv::BuiltInClipDistance: + m_clipDistance = &global; + break; + case spv::BuiltInFrontFacing: + m_frontFacing = &global; + break; + case Vkgc::GlCompatibilityInOutLocation::BackColor: + m_backColor = 
&global; + break; + case Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor: + m_frontSecondaryColor = &global; + break; + case Vkgc::GlCompatibilityInOutLocation::BackSecondaryColor: + m_backSecondaryColor = &global; + break; + default: + break; } } } @@ -427,31 +336,31 @@ void LowerGlCompatibility::buildPatchPositionInfo() { // ===================================================================================================================== // Check whether need do lower for ClipVertex. bool LowerGlCompatibility::needLowerClipVertex() { - return (m_clipVertex != nullptr && !m_clipVertex->user_empty()); + return m_clipVertex != nullptr; } // ===================================================================================================================== // Check whether need do lower for FrontColor. bool LowerGlCompatibility::needLowerFrontColor() { - return (m_frontColor != nullptr && !m_frontColor->user_empty()); + return m_frontColor != nullptr; } // ===================================================================================================================== // Check whether need do lower for BackColor. bool LowerGlCompatibility::needLowerBackColor() { - return (m_backColor != nullptr && !m_backColor->user_empty()); + return m_backColor != nullptr; } // ===================================================================================================================== // Check whether need do lower for FrontSecondaryColor. bool LowerGlCompatibility::needLowerFrontSecondaryColor() { - return (m_frontSecondaryColor != nullptr && !m_frontSecondaryColor->user_empty()); + return m_frontSecondaryColor != nullptr; } // ===================================================================================================================== // Check whether need do lower for BackSecondaryColor. 
bool LowerGlCompatibility::needLowerBackSecondaryColor() { - return (m_backSecondaryColor != nullptr && !m_backSecondaryColor->user_empty()); + return m_backSecondaryColor != nullptr; } // ===================================================================================================================== diff --git a/llpc/lowering/LowerGlCompatibility.h b/llpc/lowering/LowerGlCompatibility.h index 6b7154879d..ba75e1083b 100644 --- a/llpc/lowering/LowerGlCompatibility.h +++ b/llpc/lowering/LowerGlCompatibility.h @@ -50,8 +50,7 @@ class LowerGlCompatibility : public SpirvLower, public llvm::PassInfoMixin index, - llvm::SmallVector &out); + ShaderInOutMetadata getShaderInOutMetadata(Type *elementType, Constant *elementMetadata); void unifyFunctionReturn(Function *func); void collectEmitInst(); void collectEmulationResource(); @@ -101,7 +100,6 @@ class LowerGlCompatibility : public SpirvLower, public llvm::PassInfoMixingetBaseType(), sGep->getBasePointer(), indices, "", sGep); + auto *gep = + GetElementPtrInst::Create(sGep->getBaseType(), sGep->getBasePointer(), indices, "", sGep->getIterator()); sGep->replaceAllUsesWith(gep); sGep->eraseFromParent(); indices.clear(); } SmallVector toErase; - CompilerUtils::replaceAllPointerUses(m_builder, globalVar, proxy, toErase); + CompilerUtils::replaceAllPointerUses(globalVar, proxy, toErase); for (auto inst : toErase) inst->eraseFromParent(); } else { @@ -1950,7 +1951,7 @@ void LowerGlobals::lowerPushConsts() { Value *pushConstants = m_builder->CreateLoadPushConstantsPtr(); auto addrSpace = pushConstants->getType()->getPointerAddressSpace(); - Type *const castType = global.getValueType()->getPointerTo(addrSpace); + Type *const castType = m_builder->getPtrTy(addrSpace); pushConstants = m_builder->CreateBitCast(pushConstants, castType); SmallVector usesToReplace; diff --git a/llpc/lowering/LowerInternalLibraryIntrinsic.cpp b/llpc/lowering/LowerInternalLibraryIntrinsic.cpp index 8090df71d6..fc473f029c 100644 --- 
a/llpc/lowering/LowerInternalLibraryIntrinsic.cpp +++ b/llpc/lowering/LowerInternalLibraryIntrinsic.cpp @@ -315,12 +315,12 @@ static void createLoadStore(Function *func, Builder *builder, bool isLoad, bool gpuAddrAsPtr = builder->CreateGEP(builder->getInt8Ty(), gpuAddrAsPtr, offset); // Cast to the return type pointer - Type *gpuAddrAsTy = builder->getInt32Ty(); - gpuAddrAsPtrTy = gpuAddrAsTy->getPointerTo(SPIRAS_Global); + gpuAddrAsPtrTy = builder->getPtrTy(SPIRAS_Global); gpuAddrAsPtr = builder->CreateBitCast(gpuAddrAsPtr, gpuAddrAsPtrTy); // Load value if (isLoad) { + Type *gpuAddrAsTy = builder->getInt32Ty(); Value *loadValue = builder->CreateLoad(gpuAddrAsTy, gpuAddrAsPtr, isUncached); builder->CreateRet(loadValue); } else { @@ -392,6 +392,7 @@ InternalLibraryIntrinsicUtil::LibraryFunctionTable::LibraryFunctionTable() { m_libFuncPtrs["AmdExtLaneIndex"] = &createLaneIndex; m_libFuncPtrs["AmdExtLaneCount"] = &createLaneCount; m_libFuncPtrs["AmdExtHalt"] = &createHalt; + m_libFuncPtrs["AmdExtD3DShaderIntrinsics_Halt"] = &createHalt; m_libFuncPtrs["AmdExtDeviceMemoryAcquire"] = &createDeviceMemoryAcquire; m_libFuncPtrs["AmdExtDeviceMemoryRelease"] = &createDeviceMemoryRelease; m_libFuncPtrs["AmdExtNumWavesCompute"] = &createNumWavesCompute; diff --git a/llpc/lowering/LowerMath.cpp b/llpc/lowering/LowerMath.cpp index b5986bdf51..9cb85b7c5c 100644 --- a/llpc/lowering/LowerMath.cpp +++ b/llpc/lowering/LowerMath.cpp @@ -266,6 +266,7 @@ Function *LowerMathConstFolding::getEntryPoint() { bool LowerMathPrecision::adjustExports(Module &module, bool disablePositionOpt) { bool changed = false; + ShaderStage preFragmentStage = getLastVertexProcessingStage(); for (auto &func : module.functions()) { // Disable fast math for gl_Position. // TODO: This requires knowledge of the Builder implementation, which is not ideal. 
@@ -291,7 +292,7 @@ bool LowerMathPrecision::adjustExports(Module &module, bool disablePositionOpt) valueWritten = callInst->getOperand(0); } - if (valueWritten && builtIn == lgc::BuiltInPosition) { + if (valueWritten && builtIn == lgc::BuiltInPosition && m_shaderStage == preFragmentStage) { disableFastMath(valueWritten, disablePositionOpt); changed = true; } @@ -300,6 +301,17 @@ bool LowerMathPrecision::adjustExports(Module &module, bool disablePositionOpt) return changed; } +Vkgc::ShaderStage LowerMathPrecision::getLastVertexProcessingStage() const { + auto stageMask = m_context->getShaderStageMask(); + for (auto stage : {Vkgc::ShaderStageMesh, Vkgc::ShaderStageGeometry, Vkgc::ShaderStageTessEval, + Vkgc::ShaderStageTessControl, Vkgc::ShaderStageVertex}) { + unsigned int stageBit = 1 << stage; + if (stageMask & stageBit) + return stage; + } + return Vkgc::ShaderStageInvalid; +} + static bool clearContractFlag(Instruction *inst) { if (!isa(inst)) return false; @@ -576,14 +588,15 @@ void LowerMathFloatOp::visitFPTruncInst(FPTruncInst &fptruncInst) { if (srcTy->getScalarType()->isDoubleTy() && destTy->getScalarType()->isHalfTy()) { // NOTE: double -> float16 conversion is done in backend compiler with RTE rounding. Thus, we have to split // it with two phases to disable such lowering if we need RTZ rounding. + IRBuilder<> builder(*m_context); + builder.SetInsertPoint(&fptruncInst); auto floatTy = srcTy->isVectorTy() ? 
FixedVectorType::get(Type::getFloatTy(*m_context), cast(srcTy)->getNumElements()) : Type::getFloatTy(*m_context); - auto floatValue = new FPTruncInst(src, floatTy, "", &fptruncInst); - auto dest = new FPTruncInst(floatValue, destTy, "", &fptruncInst); + auto floatValue = builder.CreateFPTrunc(src, floatTy); + auto dest = builder.CreateFPTrunc(floatValue, destTy); fptruncInst.replaceAllUsesWith(dest); - fptruncInst.dropAllReferences(); fptruncInst.eraseFromParent(); m_changed = true; diff --git a/llpc/lowering/LowerMath.h b/llpc/lowering/LowerMath.h index 2c2e702f63..fe416f7533 100644 --- a/llpc/lowering/LowerMath.h +++ b/llpc/lowering/LowerMath.h @@ -79,6 +79,7 @@ class LowerMathPrecision : public SpirvLower, public llvm::PassInfoMixin args, llvm::Value *result, llvm::Type *inResultTy); + llvm::Value *loadShaderTableVariable(ShaderTable tableKind, llvm::Value *bufferDesc); + llvm::Value *getShaderIdentifier(ShaderStage stage, llvm::Value *shaderRecordIndex, llvm::Value *bufferDesc); + void createDbgInfo(llvm::Module &module, llvm::Function *func); + void processTerminalFunc(llvm::Function *func, llvm::CallInst *inst, RayHitStatus hitStatus); + void initTraceParamsTy(unsigned attributeSize); + void initShaderBuiltIns(); + void inlineTraceRay(llvm::CallInst *callInst, ModuleAnalysisManager &analysisManager); + llvm::Instruction *createEntryFunc(llvm::Function *func); + void createEntryTerminator(llvm::Function *func); + llvm::FunctionType *getShaderEntryFuncTy(ShaderStage stage, llvm::SmallVectorImpl &argNames); + llvm::FunctionType *getCallableShaderEntryFuncTy(llvm::SmallVectorImpl &argNames); + llvm::FunctionType *getTraceRayFuncTy(); + void createDispatchRaysInfoDesc(); + llvm::Instruction *createCallableShaderEntryFunc(llvm::Function *func); + void createCallableShaderEntryTerminator(llvm::Function *func); + llvm::SmallVector getFuncRets(llvm::Function *func) const; + llvm::SmallSet getShaderExtraInputParams(ShaderStage stage); + llvm::SmallSet 
getShaderExtraRets(ShaderStage stage); + llvm::Type *getShaderReturnTy(ShaderStage stage); + void storeFunctionCallResult(ShaderStage stage, llvm::Value *result, llvm::Argument *traceIt); + void initInputResult(ShaderStage stage, llvm::Value *payload, llvm::Value *traceParams[], llvm::Value *result, + llvm::Argument *traceIt); + void cloneDbgInfoSubprogram(llvm::Function *func, llvm::Function *newfunc); + llvm::Value *createLoadRayTracingMatrix(unsigned builtInId); + void createSetHitTriangleNodePointer(llvm::Function *func); + llvm::Function *getOrCreateRemapCapturedVaToReplayVaFunc(); + + void visitAcceptHitAndEndSearchOp(lgc::rt::AcceptHitAndEndSearchOp &inst); + void visitIgnoreHitOp(lgc::rt::IgnoreHitOp &inst); + void visitCallCallableShaderOp(lgc::rt::CallCallableShaderOp &inst); + void visitReportHitOp(lgc::rt::ReportHitOp &inst); + void visitTraceRayOp(lgc::rt::TraceRayOp &inst); + void processTraceRayCall(lgc::rt::BaseTraceRayOp *inst); + + llvm::Function *createImplFunc(llvm::CallInst &inst, llvm::ArrayRef args); + + void visitGetHitAttributes(lgc::GpurtGetHitAttributesOp &inst); + void visitSetHitAttributes(lgc::GpurtSetHitAttributesOp &inst); + void visitSetTraceParams(lgc::GpurtSetTraceParamsOp &inst); + void visitCallClosestHitShader(lgc::GpurtCallClosestHitShaderOp &inst); + void visitCallMissShader(lgc::GpurtCallMissShaderOp &inst); + void visitCallTriangleAnyHitShader(lgc::GpurtCallTriangleAnyHitShaderOp &inst); + void visitCallIntersectionShader(lgc::GpurtCallIntersectionShaderOp &inst); + void visitSetTriangleIntersectionAttributes(lgc::GpurtSetTriangleIntersectionAttributesOp &inst); + void visitSetHitTriangleNodePointer(lgc::GpurtSetHitTriangleNodePointerOp &inst); + void visitGetParentId(lgc::GpurtGetParentIdOp &inst); + void visitSetParentId(lgc::GpurtSetParentIdOp &inst); + void visitGetRayStaticId(lgc::GpurtGetRayStaticIdOp &inst); + void visitStackReadOp(lgc::GpurtStackReadOp &inst); + void visitStackWriteOp(lgc::GpurtStackWriteOp &inst); 
+ void visitLdsStackInitOp(lgc::GpurtLdsStackInitOp &inst); + void visitDispatchRayIndex(lgc::rt::DispatchRaysIndexOp &inst); + void visitDispatchRaysDimensionsOp(lgc::rt::DispatchRaysDimensionsOp &inst); + void visitWorldRayOriginOp(lgc::rt::WorldRayOriginOp &inst); + void visitWorldRayDirectionOp(lgc::rt::WorldRayDirectionOp &inst); + void visitObjectRayOriginOp(lgc::rt::ObjectRayOriginOp &inst); + void visitObjectRayDirectionOp(lgc::rt::ObjectRayDirectionOp &inst); + void visitRayTminOp(lgc::rt::RayTminOp &inst); + void visitRayTcurrentOp(lgc::rt::RayTcurrentOp &inst); + void visitInstanceIndexOp(lgc::rt::InstanceIndexOp &inst); + void visitObjectToWorldOp(lgc::rt::ObjectToWorldOp &inst); + void visitWorldToObjectOp(lgc::rt::WorldToObjectOp &inst); + void visitHitKindOp(lgc::rt::HitKindOp &inst); + void visitTriangleVertexPositionsOp(lgc::rt::TriangleVertexPositionsOp &inst); + void visitRayFlagsOp(lgc::rt::RayFlagsOp &inst); + void visitGeometryIndexOp(lgc::rt::GeometryIndexOp &inst); + void visitInstanceIdOp(lgc::rt::InstanceIdOp &inst); + void visitPrimitiveIndexOp(lgc::rt::PrimitiveIndexOp &inst); + void visitInstanceInclusionMaskOp(lgc::rt::InstanceInclusionMaskOp &inst); + void visitShaderIndexOp(lgc::rt::ShaderIndexOp &inst); + void visitShaderRecordBufferOp(lgc::rt::ShaderRecordBufferOp &inst); + + void createSqttCallCompactToken(ShaderStage stage); + void createSqttFunctionReturnToken(); + + llvm::Value *createLoadInstNodeAddr(); + + lgc::rt::RayTracingShaderStage mapStageToLgcRtShaderStage(ShaderStage stage); + std::optional m_crossModuleInliner; + unsigned m_spirvOpMetaKindId; // Metadata kind ID for "spirv.op" + + llvm::Value *m_traceParams[TraceParam::Count]; // Trace ray set parameters + llvm::StringRef m_traceParamNames[TraceParam::Count]; + llvm::Value *m_worldToObjMatrix = nullptr; // World to Object matrix + llvm::AllocaInst *m_callableData = nullptr; // Callable data variable for current callable shader + std::set> m_builtInParams; // Indirect 
max builtins; + llvm::SmallVector m_traceParamsTys; // Trace Params types + llvm::SmallVector m_callsToLower; // Call instruction to lower + llvm::SmallSet m_funcsToLower; // Functions to lower + llvm::Value *m_dispatchRaysInfoDesc = nullptr; // Descriptor of the DispatchRaysInfo + llvm::Value *m_shaderRecordIndex = nullptr; // Variable sourced from entry function argument + llvm::Instruction *m_insertPosPastInit = nullptr; // Insert position after initialization instructions (storing trace + // parameters, payload, callable data, etc.) + unsigned m_nextTraceRayId; // Next trace ray ID to be used for ray history +}; + +} // anonymous namespace + // ===================================================================================================================== -SpirvLowerRayTracing::SpirvLowerRayTracing() : m_nextTraceRayId(0) { +SpirvLowerRayTracingImpl::SpirvLowerRayTracingImpl() : m_nextTraceRayId(0) { } // ===================================================================================================================== // Process a trace ray call by creating (or get if created) an implementation function and replace the call to it. 
// // @param inst : The original call instruction -void SpirvLowerRayTracing::processTraceRayCall(BaseTraceRayOp *inst) { +void SpirvLowerRayTracingImpl::processTraceRayCall(BaseTraceRayOp *inst) { m_builder->SetInsertPoint(inst); auto rayTracingContext = static_cast(m_context->getPipelineContext()); @@ -194,7 +323,7 @@ void SpirvLowerRayTracing::processTraceRayCall(BaseTraceRayOp *inst) { // Visits "lgc.rt.call.callable.shader" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitCallCallableShaderOp(CallCallableShaderOp &inst) { +void SpirvLowerRayTracingImpl::visitCallCallableShaderOp(CallCallableShaderOp &inst) { std::string mangledName = inst.getCalledFunction()->getName().str() + ".impl"; auto shaderIndex = inst.getShaderIndex(); @@ -287,7 +416,7 @@ void SpirvLowerRayTracing::visitCallCallableShaderOp(CallCallableShaderOp &inst) // Visits "lgc.rt.report.hit" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitReportHitOp(ReportHitOp &inst) { +void SpirvLowerRayTracingImpl::visitReportHitOp(ReportHitOp &inst) { m_builder->SetInsertPoint(&inst); assert(m_shaderStage == ShaderStageRayTracingIntersect); @@ -418,7 +547,7 @@ void SpirvLowerRayTracing::visitReportHitOp(ReportHitOp &inst) { // // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerRayTracing::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses SpirvLowerRayTracingImpl::run(Module &module, ModuleAnalysisManager &analysisManager) { LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Ray-Tracing\n"); SpirvLower::init(&module); @@ -465,21 +594,21 @@ PreservedAnalyses SpirvLowerRayTracing::run(Module &module, ModuleAnalysisManage m_entryPoint->setMetadata(RtName::ContinufyStageMeta, MDNode::get(*m_context, ConstantAsMetadata::get(m_builder->getInt32(lgcRtStage)))); - static auto visitor = 
llvm_dialects::VisitorBuilder() + static auto visitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) - .add(&SpirvLowerRayTracing::visitGetHitAttributes) - .add(&SpirvLowerRayTracing::visitSetHitAttributes) - .add(&SpirvLowerRayTracing::visitSetTraceParams) - .add(&SpirvLowerRayTracing::visitCallClosestHitShader) - .add(&SpirvLowerRayTracing::visitCallMissShader) - .add(&SpirvLowerRayTracing::visitCallTriangleAnyHitShader) - .add(&SpirvLowerRayTracing::visitCallIntersectionShader) - .add(&SpirvLowerRayTracing::visitSetTriangleIntersectionAttributes) - .add(&SpirvLowerRayTracing::visitSetHitTriangleNodePointer) - .add(&SpirvLowerRayTracing::visitGetParentId) - .add(&SpirvLowerRayTracing::visitSetParentId) - .add(&SpirvLowerRayTracing::visitGetRayStaticId) - .add(&SpirvLowerRayTracing::visitDispatchRayIndex) + .add(&SpirvLowerRayTracingImpl::visitGetHitAttributes) + .add(&SpirvLowerRayTracingImpl::visitSetHitAttributes) + .add(&SpirvLowerRayTracingImpl::visitSetTraceParams) + .add(&SpirvLowerRayTracingImpl::visitCallClosestHitShader) + .add(&SpirvLowerRayTracingImpl::visitCallMissShader) + .add(&SpirvLowerRayTracingImpl::visitCallTriangleAnyHitShader) + .add(&SpirvLowerRayTracingImpl::visitCallIntersectionShader) + .add(&SpirvLowerRayTracingImpl::visitSetTriangleIntersectionAttributes) + .add(&SpirvLowerRayTracingImpl::visitSetHitTriangleNodePointer) + .add(&SpirvLowerRayTracingImpl::visitGetParentId) + .add(&SpirvLowerRayTracingImpl::visitSetParentId) + .add(&SpirvLowerRayTracingImpl::visitGetRayStaticId) + .add(&SpirvLowerRayTracingImpl::visitDispatchRayIndex) .build(); visitor.visit(*this, *m_entryPoint); @@ -501,36 +630,36 @@ PreservedAnalyses SpirvLowerRayTracing::run(Module &module, ModuleAnalysisManage m_insertPosPastInit = insertPos; - static auto visitor = llvm_dialects::VisitorBuilder() + static auto visitor = llvm_dialects::VisitorBuilder() 
.setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) - .add(&SpirvLowerRayTracing::visitAcceptHitAndEndSearchOp) - .add(&SpirvLowerRayTracing::visitIgnoreHitOp) - .add(&SpirvLowerRayTracing::visitCallCallableShaderOp) - .add(&SpirvLowerRayTracing::visitReportHitOp) - .add(&SpirvLowerRayTracing::visitTraceRayOp) - .add(&SpirvLowerRayTracing::visitDispatchRayIndex) - .add(&SpirvLowerRayTracing::visitDispatchRaysDimensionsOp) - .add(&SpirvLowerRayTracing::visitWorldRayOriginOp) - .add(&SpirvLowerRayTracing::visitWorldRayDirectionOp) - .add(&SpirvLowerRayTracing::visitObjectRayOriginOp) - .add(&SpirvLowerRayTracing::visitObjectRayDirectionOp) - .add(&SpirvLowerRayTracing::visitRayTminOp) - .add(&SpirvLowerRayTracing::visitRayTcurrentOp) - .add(&SpirvLowerRayTracing::visitInstanceIndexOp) - .add(&SpirvLowerRayTracing::visitObjectToWorldOp) - .add(&SpirvLowerRayTracing::visitWorldToObjectOp) - .add(&SpirvLowerRayTracing::visitHitKindOp) - .add(&SpirvLowerRayTracing::visitTriangleVertexPositionsOp) - .add(&SpirvLowerRayTracing::visitRayFlagsOp) - .add(&SpirvLowerRayTracing::visitGeometryIndexOp) - .add(&SpirvLowerRayTracing::visitInstanceIdOp) - .add(&SpirvLowerRayTracing::visitPrimitiveIndexOp) - .add(&SpirvLowerRayTracing::visitInstanceInclusionMaskOp) - .add(&SpirvLowerRayTracing::visitShaderIndexOp) - .add(&SpirvLowerRayTracing::visitShaderRecordBufferOp) - .add(&SpirvLowerRayTracing::visitStackReadOp) - .add(&SpirvLowerRayTracing::visitStackWriteOp) - .add(&SpirvLowerRayTracing::visitLdsStackInitOp) + .add(&SpirvLowerRayTracingImpl::visitAcceptHitAndEndSearchOp) + .add(&SpirvLowerRayTracingImpl::visitIgnoreHitOp) + .add(&SpirvLowerRayTracingImpl::visitCallCallableShaderOp) + .add(&SpirvLowerRayTracingImpl::visitReportHitOp) + .add(&SpirvLowerRayTracingImpl::visitTraceRayOp) + .add(&SpirvLowerRayTracingImpl::visitDispatchRayIndex) + .add(&SpirvLowerRayTracingImpl::visitDispatchRaysDimensionsOp) + .add(&SpirvLowerRayTracingImpl::visitWorldRayOriginOp) + 
.add(&SpirvLowerRayTracingImpl::visitWorldRayDirectionOp) + .add(&SpirvLowerRayTracingImpl::visitObjectRayOriginOp) + .add(&SpirvLowerRayTracingImpl::visitObjectRayDirectionOp) + .add(&SpirvLowerRayTracingImpl::visitRayTminOp) + .add(&SpirvLowerRayTracingImpl::visitRayTcurrentOp) + .add(&SpirvLowerRayTracingImpl::visitInstanceIndexOp) + .add(&SpirvLowerRayTracingImpl::visitObjectToWorldOp) + .add(&SpirvLowerRayTracingImpl::visitWorldToObjectOp) + .add(&SpirvLowerRayTracingImpl::visitHitKindOp) + .add(&SpirvLowerRayTracingImpl::visitTriangleVertexPositionsOp) + .add(&SpirvLowerRayTracingImpl::visitRayFlagsOp) + .add(&SpirvLowerRayTracingImpl::visitGeometryIndexOp) + .add(&SpirvLowerRayTracingImpl::visitInstanceIdOp) + .add(&SpirvLowerRayTracingImpl::visitPrimitiveIndexOp) + .add(&SpirvLowerRayTracingImpl::visitInstanceInclusionMaskOp) + .add(&SpirvLowerRayTracingImpl::visitShaderIndexOp) + .add(&SpirvLowerRayTracingImpl::visitShaderRecordBufferOp) + .add(&SpirvLowerRayTracingImpl::visitStackReadOp) + .add(&SpirvLowerRayTracingImpl::visitStackWriteOp) + .add(&SpirvLowerRayTracingImpl::visitLdsStackInitOp) .build(); visitor.visit(*this, *m_module); @@ -571,7 +700,7 @@ PreservedAnalyses SpirvLowerRayTracing::run(Module &module, ModuleAnalysisManage // ===================================================================================================================== // Create alloc variable for the TraceParam -void SpirvLowerRayTracing::createTraceParams(Function *entryFunc) { +void SpirvLowerRayTracingImpl::createTraceParams(Function *entryFunc) { m_builder->SetInsertPointPastAllocas(entryFunc); for (unsigned i = 0; i < TraceParam::Count; ++i) { m_traceParams[i] = @@ -585,7 +714,8 @@ void SpirvLowerRayTracing::createTraceParams(Function *entryFunc) { // @param func : Function to create // @param instArgsNum : Dialect instruction num // @param traceParamsOffset : TraceParams Offset -void SpirvLowerRayTracing::createSetHitAttributes(Function *func, unsigned 
instArgsNum, unsigned traceParamsOffset) { +void SpirvLowerRayTracingImpl::createSetHitAttributes(Function *func, unsigned instArgsNum, + unsigned traceParamsOffset) { eraseFunctionBlocks(func); BasicBlock *entryBlock = BasicBlock::Create(*m_context, "", func); m_builder->SetInsertPoint(entryBlock); @@ -614,7 +744,7 @@ void SpirvLowerRayTracing::createSetHitAttributes(Function *func, unsigned instA // // @param func : Function to create // @param instArgsNum : Dialect inst arguments count -void SpirvLowerRayTracing::createSetTraceParams(Function *func, unsigned instArgsNum) { +void SpirvLowerRayTracingImpl::createSetTraceParams(Function *func, unsigned instArgsNum) { eraseFunctionBlocks(func); BasicBlock *entryBlock = BasicBlock::Create(*m_context, "", func); m_builder->SetInsertPoint(entryBlock); @@ -672,8 +802,8 @@ void SpirvLowerRayTracing::createSetTraceParams(Function *func, unsigned instArg // @param intersectId : Module ID of intersection shader // @param retVal : Function return value // @param traceParamsArgOffset : Non TraceParam arguments number -void SpirvLowerRayTracing::createCallShaderFunc(Function *func, ShaderStage stage, unsigned intersectId, Value *retVal, - unsigned traceParamsArgOffset) { +void SpirvLowerRayTracingImpl::createCallShaderFunc(Function *func, ShaderStage stage, unsigned intersectId, + Value *retVal, unsigned traceParamsArgOffset) { auto rayTracingContext = static_cast(m_context->getPipelineContext()); auto shaderStageMask = rayTracingContext->getShaderStageMask(); @@ -715,9 +845,10 @@ void SpirvLowerRayTracing::createCallShaderFunc(Function *func, ShaderStage stag // @param entryBlock : Entry block // @param endBlock : End block // @param traceParamsArgOffset : The count of beginning function non traceParam function arguments -void SpirvLowerRayTracing::createCallShader(Function *func, ShaderStage stage, unsigned intersectId, Value *shaderId, - Value *shaderRecordIndex, Value *inputResult, BasicBlock *entryBlock, - BasicBlock 
*endBlock, unsigned traceParamsArgOffset) { +void SpirvLowerRayTracingImpl::createCallShader(Function *func, ShaderStage stage, unsigned intersectId, + Value *shaderId, Value *shaderRecordIndex, Value *inputResult, + BasicBlock *entryBlock, BasicBlock *endBlock, + unsigned traceParamsArgOffset) { auto rayTracingContext = static_cast(m_context->getPipelineContext()); auto indirectStageMask = rayTracingContext->getIndirectStageMask(); bool indirectShader = indirectStageMask & shaderStageToMask(stage); @@ -797,7 +928,7 @@ void SpirvLowerRayTracing::createCallShader(Function *func, ShaderStage stage, u // Patch library AmdTraceRaySetTriangleIntersectionAttributes function // // @param func : Function to create -void SpirvLowerRayTracing::createSetTriangleInsection(Function *func) { +void SpirvLowerRayTracingImpl::createSetTriangleInsection(Function *func) { eraseFunctionBlocks(func); BasicBlock *entryBlock = BasicBlock::Create(*m_context, "", func); m_builder->SetInsertPoint(entryBlock); @@ -822,7 +953,7 @@ void SpirvLowerRayTracing::createSetTriangleInsection(Function *func) { // // @param tableKind : Kind of shader table variable to create // @param bufferDesc : Dispatch ray buffer descriptor -Value *SpirvLowerRayTracing::loadShaderTableVariable(ShaderTable tableKind, Value *bufferDesc) { +Value *SpirvLowerRayTracingImpl::loadShaderTableVariable(ShaderTable tableKind, Value *bufferDesc) { assert(tableKind < ShaderTable::Count); switch (tableKind) { case ShaderTable::RayGenTableAddr: { @@ -946,9 +1077,9 @@ Value *SpirvLowerRayTracing::loadShaderTableVariable(ShaderTable tableKind, Valu // @param args : Argument list of function call // @param inResult : Allocated value to store function return value // @param inResultTy : Base type of inResult param -void SpirvLowerRayTracing::createShaderSelection(Function *func, BasicBlock *entryBlock, BasicBlock *endBlock, - Value *shaderId, unsigned intersectId, ShaderStage stage, - ArrayRef args, Value *inResult, Type 
*inResultTy) { +void SpirvLowerRayTracingImpl::createShaderSelection(Function *func, BasicBlock *entryBlock, BasicBlock *endBlock, + Value *shaderId, unsigned intersectId, ShaderStage stage, + ArrayRef args, Value *inResult, Type *inResultTy) { // .entry: // switch i32 %shaderId, label % .end[ // i32 2, label % .shader2 @@ -1004,7 +1135,7 @@ void SpirvLowerRayTracing::createShaderSelection(Function *func, BasicBlock *ent // @param stage : Shader stage // @param shaderRecordIndex : Shader table record index // @param bufferDesc : DispatchRay descriptor -Value *SpirvLowerRayTracing::getShaderIdentifier(ShaderStage stage, Value *shaderRecordIndex, Value *bufferDesc) { +Value *SpirvLowerRayTracingImpl::getShaderIdentifier(ShaderStage stage, Value *shaderRecordIndex, Value *bufferDesc) { ShaderTable tableAddr = ShaderTable::Count; ShaderTable tableStride = ShaderTable::Count; unsigned offset = 0; @@ -1062,7 +1193,7 @@ Value *SpirvLowerRayTracing::getShaderIdentifier(ShaderStage stage, Value *shade Type *gpuAddrAsPtrTy = PointerType::get(*m_context, SPIRAS_Global); auto shaderIdentifierAsPtr = m_builder->CreateIntToPtr(tableAddrVal, gpuAddrAsPtrTy); Value *shaderIdentifier = m_builder->CreateGEP(m_builder->getInt8Ty(), shaderIdentifierAsPtr, offsetVal); - auto loadPtrTy = m_builder->getInt64Ty()->getPointerTo(SPIRAS_Global); + auto loadPtrTy = m_builder->getPtrTy(SPIRAS_Global); shaderIdentifier = m_builder->CreateBitCast(shaderIdentifier, loadPtrTy); shaderIdentifier = m_builder->CreateLoad(m_builder->getInt64Ty(), shaderIdentifier); @@ -1074,7 +1205,7 @@ Value *SpirvLowerRayTracing::getShaderIdentifier(ShaderStage stage, Value *shade // // @param shaderIdentifier : Input shader identifier for the function // @param shaderRecordIndex : Shader record index -void SpirvLowerRayTracing::createAnyHitFunc(Value *shaderIdentifier, Value *shaderRecordIndex) { +void SpirvLowerRayTracingImpl::createAnyHitFunc(Value *shaderIdentifier, Value *shaderRecordIndex) { 
IRBuilderBase::InsertPointGuard ipg(*m_builder); Function *func = dyn_cast_or_null(m_module->getFunction(RtName::CallAnyHitShader)); if (!func) { @@ -1127,7 +1258,7 @@ void SpirvLowerRayTracing::createAnyHitFunc(Value *shaderIdentifier, Value *shad // ===================================================================================================================== // Process ray gen functions, threads of launchId should not exceed the launchSize -void SpirvLowerRayTracing::createRayGenEntryFunc() { +void SpirvLowerRayTracingImpl::createRayGenEntryFunc() { // .entry // %xgreat = icmp ge i32 %launchId.x, %launchSize.x // %ygreat = icmp ge i32 %launchId.y, %launchSize.y @@ -1250,7 +1381,7 @@ void SpirvLowerRayTracing::createRayGenEntryFunc() { // // @param module : LLVM module to be used by the DIBuilder // @param func : Function to process -void SpirvLowerRayTracing::createDbgInfo(Module &module, Function *func) { +void SpirvLowerRayTracingImpl::createDbgInfo(Module &module, Function *func) { DIBuilder builder(module); DIFile *file = builder.createFile(func->getName(), "."); builder.createCompileUnit(dwarf::DW_LANG_C99, file, "llvmIR", false, "", 0, "", DICompileUnit::LineTablesOnly); @@ -1271,7 +1402,7 @@ void SpirvLowerRayTracing::createDbgInfo(Module &module, Function *func) { // // @param func : Old Function to be deprecated // @param newFunc : New Function to be processed -void SpirvLowerRayTracing::cloneDbgInfoSubgrogram(llvm::Function *func, llvm::Function *newFunc) { +void SpirvLowerRayTracingImpl::cloneDbgInfoSubprogram(llvm::Function *func, llvm::Function *newFunc) { if (auto subprogram = func->getSubprogram()) { auto metadata = MDString::get(*m_context, newFunc->getName()); // Replace DISubProgram name and linkname to the new function name @@ -1289,7 +1420,7 @@ void SpirvLowerRayTracing::cloneDbgInfoSubgrogram(llvm::Function *func, llvm::Fu // @param func : Processed function // @param callInst : CallInst of terminal op // @param hitStatus : Ray hit 
Status -void SpirvLowerRayTracing::processTerminalFunc(Function *func, CallInst *callInst, RayHitStatus hitStatus) { +void SpirvLowerRayTracingImpl::processTerminalFunc(Function *func, CallInst *callInst, RayHitStatus hitStatus) { // .entry: // ... @@ -1320,7 +1451,7 @@ void SpirvLowerRayTracing::processTerminalFunc(Function *func, CallInst *callIns // ===================================================================================================================== // Create traceray module entry function -CallInst *SpirvLowerRayTracing::createTraceRay() { +CallInst *SpirvLowerRayTracingImpl::createTraceRay() { assert(m_shaderStage == ShaderStageCompute); // Create traceRay module entry function @@ -1522,7 +1653,7 @@ CallInst *SpirvLowerRayTracing::createTraceRay() { // // @param callInst : Where to inline function // @param analysisManager : : Analysis manager to use for this transformation -void SpirvLowerRayTracing::inlineTraceRay(llvm::CallInst *callInst, ModuleAnalysisManager &analysisManager) { +void SpirvLowerRayTracingImpl::inlineTraceRay(llvm::CallInst *callInst, ModuleAnalysisManager &analysisManager) { FunctionAnalysisManager &fam = analysisManager.getResult(*m_module).getManager(); auto getAssumptionCache = [&](Function &F) -> AssumptionCache & { return fam.getResult(F); }; auto getBFI = [&](Function &F) -> BlockFrequencyInfo & { return fam.getResult(F); }; @@ -1552,7 +1683,7 @@ void SpirvLowerRayTracing::inlineTraceRay(llvm::CallInst *callInst, ModuleAnalys // init TraceParam types // // @param traceParam : trace params -void SpirvLowerRayTracing::initTraceParamsTy(unsigned attributeSize) { +void SpirvLowerRayTracingImpl::initTraceParamsTy(unsigned attributeSize) { auto floatx3Ty = FixedVectorType::get(Type::getFloatTy(*m_context), 3); auto rayTracingContext = static_cast(m_context->getPipelineContext()); const auto payloadType = rayTracingContext->getPayloadType(m_builder); @@ -1604,7 +1735,7 @@ void SpirvLowerRayTracing::initTraceParamsTy(unsigned 
attributeSize) { // ===================================================================================================================== // Initialize builting for shader call -void SpirvLowerRayTracing::initShaderBuiltIns() { +void SpirvLowerRayTracingImpl::initShaderBuiltIns() { assert(m_builtInParams.size() == 0); auto rayTracingContext = static_cast(m_context->getPipelineContext()); const auto *buildInfo = rayTracingContext->getRayTracingPipelineBuildInfo(); @@ -1710,7 +1841,7 @@ void SpirvLowerRayTracing::initShaderBuiltIns() { // // @param func : The shader stage of entry function // @param argNames : Filled with the names of arguments -FunctionType *SpirvLowerRayTracing::getShaderEntryFuncTy(ShaderStage stage, SmallVectorImpl &argNames) { +FunctionType *SpirvLowerRayTracingImpl::getShaderEntryFuncTy(ShaderStage stage, SmallVectorImpl &argNames) { SmallVector argTys; auto retTy = getShaderReturnTy(stage); @@ -1735,7 +1866,7 @@ FunctionType *SpirvLowerRayTracing::getShaderEntryFuncTy(ShaderStage stage, Smal // Mutate entry function for the shader stage, ClosestHit, Intersect, AnyHit, Miss // // @param func : Function to create -Instruction *SpirvLowerRayTracing::createEntryFunc(Function *func) { +Instruction *SpirvLowerRayTracingImpl::createEntryFunc(Function *func) { // Set old entry function name deprecated func->setName("deprecated"); @@ -1764,7 +1895,7 @@ Instruction *SpirvLowerRayTracing::createEntryFunc(Function *func) { } // Transfer DiSubprogram to the new function - cloneDbgInfoSubgrogram(func, newFunc); + cloneDbgInfoSubprogram(func, newFunc); // Now entry function pointer to the new function m_entryPoint = newFunc; @@ -1803,8 +1934,8 @@ Instruction *SpirvLowerRayTracing::createEntryFunc(Function *func) { // @param func : Function to create // @param stage : Ray tracing shader stage // @param traceParamsArgOffset : Non traceParam arguments number -void SpirvLowerRayTracing::updateGlobalFromCallShaderFunc(Function *func, ShaderStage stage, - unsigned 
traceParamsArgOffset) { +void SpirvLowerRayTracingImpl::updateGlobalFromCallShaderFunc(Function *func, ShaderStage stage, + unsigned traceParamsArgOffset) { auto zero = m_builder->getInt32(0); auto one = m_builder->getInt32(1); @@ -1826,7 +1957,7 @@ void SpirvLowerRayTracing::updateGlobalFromCallShaderFunc(Function *func, Shader // ===================================================================================================================== // Get callabe shader entry function type -FunctionType *SpirvLowerRayTracing::getCallableShaderEntryFuncTy(SmallVectorImpl &argNames) { +FunctionType *SpirvLowerRayTracingImpl::getCallableShaderEntryFuncTy(SmallVectorImpl &argNames) { auto rayTracingContext = static_cast(m_context->getPipelineContext()); SmallVector argTys; auto callableDataTy = rayTracingContext->getCallableDataType(m_builder); @@ -1841,7 +1972,7 @@ FunctionType *SpirvLowerRayTracing::getCallableShaderEntryFuncTy(SmallVectorImpl // ===================================================================================================================== // Get traceray function type -FunctionType *SpirvLowerRayTracing::getTraceRayFuncTy() { +FunctionType *SpirvLowerRayTracingImpl::getTraceRayFuncTy() { auto rayTracingContext = static_cast(m_context->getPipelineContext()); auto retTy = rayTracingContext->getPayloadType(m_builder); SmallVector argsTys = { @@ -1872,7 +2003,7 @@ FunctionType *SpirvLowerRayTracing::getTraceRayFuncTy() { // Mutate entry function for the shader stage callable shader // // @param func : Function to create -Instruction *SpirvLowerRayTracing::createCallableShaderEntryFunc(Function *func) { +Instruction *SpirvLowerRayTracingImpl::createCallableShaderEntryFunc(Function *func) { // Set old entry function name deprecated func->setName("deprecatedCallableShader"); @@ -1898,7 +2029,7 @@ Instruction *SpirvLowerRayTracing::createCallableShaderEntryFunc(Function *func) } // Transfer DiSubprogram to the new function - 
cloneDbgInfoSubgrogram(func, newFunc); + cloneDbgInfoSubprogram(func, newFunc); // Now entry function pointer to the new function m_entryPoint = newFunc; @@ -1924,7 +2055,7 @@ Instruction *SpirvLowerRayTracing::createCallableShaderEntryFunc(Function *func) // Get all the function ReturnInst // // @param func : Function to gather ReturnInst -SmallVector SpirvLowerRayTracing::getFuncRets(Function *func) const { +SmallVector SpirvLowerRayTracingImpl::getFuncRets(Function *func) const { SmallVector rets; for (auto &block : *func) { auto blockTerm = block.getTerminator(); @@ -1938,7 +2069,7 @@ SmallVector SpirvLowerRayTracing::getFuncRets(Function *func) con // Get the extra parameters needed for calling indirect shader // // @param stage : The shader stage of shader to call -SmallSet SpirvLowerRayTracing::getShaderExtraInputParams(ShaderStage stage) { +SmallSet SpirvLowerRayTracingImpl::getShaderExtraInputParams(ShaderStage stage) { SmallSet params; switch (stage) { @@ -1974,7 +2105,7 @@ SmallSet SpirvLowerRayTracing::getShaderExtraInputParams(ShaderStag // Get the extra return values needed for indirect shader, in addition to payload // // @param stage : The shader stage -SmallSet SpirvLowerRayTracing::getShaderExtraRets(ShaderStage stage) { +SmallSet SpirvLowerRayTracingImpl::getShaderExtraRets(ShaderStage stage) { auto rayTracingContext = static_cast(m_context->getPipelineContext()); SmallSet rets; @@ -2003,7 +2134,7 @@ SmallSet SpirvLowerRayTracing::getShaderExtraRets(ShaderStage stage // Get return type for specific shader stage // // @param stage : The shader stage -Type *SpirvLowerRayTracing::getShaderReturnTy(ShaderStage stage) { +Type *SpirvLowerRayTracingImpl::getShaderReturnTy(ShaderStage stage) { auto rayTracingContext = static_cast(m_context->getPipelineContext()); // Return payload in default @@ -2021,7 +2152,7 @@ Type *SpirvLowerRayTracing::getShaderReturnTy(ShaderStage stage) { // // @param stage : The shader stage // @param result : The result to store 
-void SpirvLowerRayTracing::storeFunctionCallResult(ShaderStage stage, Value *result, Argument *traceParamsIt) { +void SpirvLowerRayTracingImpl::storeFunctionCallResult(ShaderStage stage, Value *result, Argument *traceParamsIt) { auto rayTracingContext = static_cast(m_context->getPipelineContext()); unsigned payloadSizeInDword = rayTracingContext->getPayloadSizeInDword(); @@ -2069,8 +2200,8 @@ void SpirvLowerRayTracing::storeFunctionCallResult(ShaderStage stage, Value *res // @param traceParams : The value to initialize second part of inputResult // @param result : The result to initialize // @param traceParams : TraceParam argument -void SpirvLowerRayTracing::initInputResult(ShaderStage stage, Value *payload, Value *traceParams[], Value *result, - Argument *traceParamsIt) { +void SpirvLowerRayTracingImpl::initInputResult(ShaderStage stage, Value *payload, Value *traceParams[], Value *result, + Argument *traceParamsIt) { auto rayTracingContext = static_cast(m_context->getPipelineContext()); unsigned payloadSizeInDword = rayTracingContext->getPayloadSizeInDword(); @@ -2115,7 +2246,7 @@ void SpirvLowerRayTracing::initInputResult(ShaderStage stage, Value *payload, Va // Load ObjectToWorld or WorldToObject matrix // // @param builtInId : ID of the built-in variable -Value *SpirvLowerRayTracing::createLoadRayTracingMatrix(unsigned builtInId) { +Value *SpirvLowerRayTracingImpl::createLoadRayTracingMatrix(unsigned builtInId) { assert(builtInId == BuiltInWorldToObjectKHR || builtInId == BuiltInObjectToWorldKHR); IRBuilderBase::InsertPointGuard guard(*m_builder); @@ -2132,7 +2263,7 @@ Value *SpirvLowerRayTracing::createLoadRayTracingMatrix(unsigned builtInId) { // Process AmdTraceRaySetHitTriangleNodePointer function // // @param func : The function to create -void SpirvLowerRayTracing::createSetHitTriangleNodePointer(Function *func) { +void SpirvLowerRayTracingImpl::createSetHitTriangleNodePointer(Function *func) { eraseFunctionBlocks(func); BasicBlock *entryBlock = 
BasicBlock::Create(*m_context, "", func); m_builder->SetInsertPoint(entryBlock); @@ -2164,7 +2295,7 @@ void SpirvLowerRayTracing::createSetHitTriangleNodePointer(Function *func) { // Process entry function return instruction, replace new return payload/etc info // // @param func : The function to process -void SpirvLowerRayTracing::createEntryTerminator(Function *func) { +void SpirvLowerRayTracingImpl::createEntryTerminator(Function *func) { // Return incoming payload, and other values if needed auto rayTracingContext = static_cast(m_context->getPipelineContext()); for (auto ret : getFuncRets(func)) { @@ -2211,7 +2342,7 @@ void SpirvLowerRayTracing::createEntryTerminator(Function *func) { // Add return callable data // // @param func : The function to process -void SpirvLowerRayTracing::createCallableShaderEntryTerminator(Function *func) { +void SpirvLowerRayTracingImpl::createCallableShaderEntryTerminator(Function *func) { // return global callable data for (auto ret : getFuncRets(func)) { m_builder->SetInsertPoint(ret); @@ -2224,7 +2355,7 @@ void SpirvLowerRayTracing::createCallableShaderEntryTerminator(Function *func) { // ===================================================================================================================== // Get RemapCapturedVaToReplayVa function for indirect pipeline capture replay, create it if it does not exist. 
-Function *SpirvLowerRayTracing::getOrCreateRemapCapturedVaToReplayVaFunc() { +Function *SpirvLowerRayTracingImpl::getOrCreateRemapCapturedVaToReplayVaFunc() { Function *func = dyn_cast_or_null(m_module->getFunction(RtName::RemapCapturedVaToReplayVa)); // uint64_t RemapCapturedVaToReplayVa(uint64_t shdaerId) { // // InternalBuffer contains array of Vkgc::RayTracingCaptureReplayVaMappingEntry @@ -2318,7 +2449,7 @@ Function *SpirvLowerRayTracing::getOrCreateRemapCapturedVaToReplayVaFunc() { // ===================================================================================================================== // Get DispatchRaysInfo Descriptor // -void SpirvLowerRayTracing::createDispatchRaysInfoDesc() { +void SpirvLowerRayTracingImpl::createDispatchRaysInfoDesc() { if (!m_dispatchRaysInfoDesc) { m_dispatchRaysInfoDesc = m_builder->create( TraceRayDescriptorSet, RayTracingResourceIndexDispatchRaysInfo, m_builder->getInt32(0), 0); @@ -2330,7 +2461,7 @@ void SpirvLowerRayTracing::createDispatchRaysInfoDesc() { // Visits "lgc.rt.accept.hit.and.end.search" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitAcceptHitAndEndSearchOp(AcceptHitAndEndSearchOp &inst) { +void SpirvLowerRayTracingImpl::visitAcceptHitAndEndSearchOp(AcceptHitAndEndSearchOp &inst) { processTerminalFunc(m_entryPoint, &cast(inst), RayHitStatus::AcceptAndEndSearch); } @@ -2338,7 +2469,7 @@ void SpirvLowerRayTracing::visitAcceptHitAndEndSearchOp(AcceptHitAndEndSearchOp // Visits "lgc.rt.ignore.hit" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitIgnoreHitOp(IgnoreHitOp &inst) { +void SpirvLowerRayTracingImpl::visitIgnoreHitOp(IgnoreHitOp &inst) { processTerminalFunc(m_entryPoint, &cast(inst), RayHitStatus::Ignore); } @@ -2346,7 +2477,7 @@ void SpirvLowerRayTracing::visitIgnoreHitOp(IgnoreHitOp &inst) { // Visits "lgc.rt.trace.ray" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitTraceRayOp(TraceRayOp &inst) 
{ +void SpirvLowerRayTracingImpl::visitTraceRayOp(TraceRayOp &inst) { processTraceRayCall(&inst); } @@ -2354,7 +2485,7 @@ void SpirvLowerRayTracing::visitTraceRayOp(TraceRayOp &inst) { // Visits "lgc.gpurt.get.hit.attributes" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitGetHitAttributes(lgc::GpurtGetHitAttributesOp &inst) { +void SpirvLowerRayTracingImpl::visitGetHitAttributes(lgc::GpurtGetHitAttributesOp &inst) { m_builder->SetInsertPoint(&inst); Value *tCurrent = m_builder->CreateLoad(m_traceParamsTys[TraceParam::TCurrent], m_traceParams[TraceParam::TCurrent]); Value *kind = m_builder->CreateLoad(m_traceParamsTys[TraceParam::Kind], m_traceParams[TraceParam::Kind]); @@ -2372,7 +2503,7 @@ void SpirvLowerRayTracing::visitGetHitAttributes(lgc::GpurtGetHitAttributesOp &i // Visits "lgc.gpurt.set.hit.attributes" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitSetHitAttributes(lgc::GpurtSetHitAttributesOp &inst) { +void SpirvLowerRayTracingImpl::visitSetHitAttributes(lgc::GpurtSetHitAttributesOp &inst) { m_builder->SetInsertPoint(&inst); ArrayRef args(&m_traceParams[TraceParam::TMin], TraceParam::GeometryIndex - TraceParam::TMin + 1); @@ -2389,7 +2520,7 @@ void SpirvLowerRayTracing::visitSetHitAttributes(lgc::GpurtSetHitAttributesOp &i // Visits "lgc.gpurt.set.trace.params" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitSetTraceParams(lgc::GpurtSetTraceParamsOp &inst) { +void SpirvLowerRayTracingImpl::visitSetTraceParams(lgc::GpurtSetTraceParamsOp &inst) { m_builder->SetInsertPoint(&inst); ArrayRef args(m_traceParams, TraceParam::TMax + 1); auto func = createImplFunc(inst, args); @@ -2405,7 +2536,7 @@ void SpirvLowerRayTracing::visitSetTraceParams(lgc::GpurtSetTraceParamsOp &inst) // Visits "lgc.gpurt.call.closest.hit.shader" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitCallClosestHitShader(lgc::GpurtCallClosestHitShaderOp &inst) 
{ +void SpirvLowerRayTracingImpl::visitCallClosestHitShader(lgc::GpurtCallClosestHitShaderOp &inst) { m_builder->SetInsertPoint(&inst); ArrayRef args(m_traceParams, TraceParam::Count); @@ -2423,7 +2554,7 @@ void SpirvLowerRayTracing::visitCallClosestHitShader(lgc::GpurtCallClosestHitSha // Visits "lgc.gpurt.call.miss.shader" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitCallMissShader(lgc::GpurtCallMissShaderOp &inst) { +void SpirvLowerRayTracingImpl::visitCallMissShader(lgc::GpurtCallMissShaderOp &inst) { m_builder->SetInsertPoint(&inst); ArrayRef args(m_traceParams, TraceParam::Count); auto func = createImplFunc(inst, args); @@ -2440,7 +2571,7 @@ void SpirvLowerRayTracing::visitCallMissShader(lgc::GpurtCallMissShaderOp &inst) // Visits "lgc.gpurt.call.triangle.any.hit.shader" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitCallTriangleAnyHitShader(lgc::GpurtCallTriangleAnyHitShaderOp &inst) { +void SpirvLowerRayTracingImpl::visitCallTriangleAnyHitShader(lgc::GpurtCallTriangleAnyHitShaderOp &inst) { m_builder->SetInsertPoint(&inst); ArrayRef args(m_traceParams, TraceParam::Count); auto func = createImplFunc(inst, args); @@ -2457,7 +2588,7 @@ void SpirvLowerRayTracing::visitCallTriangleAnyHitShader(lgc::GpurtCallTriangleA // Visits "lgc.gpurt.call.intersection.shader" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitCallIntersectionShader(lgc::GpurtCallIntersectionShaderOp &inst) { +void SpirvLowerRayTracingImpl::visitCallIntersectionShader(lgc::GpurtCallIntersectionShaderOp &inst) { m_builder->SetInsertPoint(&inst); ArrayRef args(m_traceParams, TraceParam::Count); auto func = createImplFunc(inst, args); @@ -2474,7 +2605,8 @@ void SpirvLowerRayTracing::visitCallIntersectionShader(lgc::GpurtCallIntersectio // Visits "lgc.gpurt.set.triangle.intersection.attributes" instructions // // @param inst : The instruction -void 
SpirvLowerRayTracing::visitSetTriangleIntersectionAttributes(lgc::GpurtSetTriangleIntersectionAttributesOp &inst) { +void SpirvLowerRayTracingImpl::visitSetTriangleIntersectionAttributes( + lgc::GpurtSetTriangleIntersectionAttributesOp &inst) { m_builder->SetInsertPoint(&inst); auto func = createImplFunc(inst, {m_traceParams[TraceParam::HitAttributes]}); @@ -2490,7 +2622,7 @@ void SpirvLowerRayTracing::visitSetTriangleIntersectionAttributes(lgc::GpurtSetT // Visits "lgc.gpurt.set.hit.triangle.node.pointer" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitSetHitTriangleNodePointer(lgc::GpurtSetHitTriangleNodePointerOp &inst) { +void SpirvLowerRayTracingImpl::visitSetHitTriangleNodePointer(lgc::GpurtSetHitTriangleNodePointerOp &inst) { m_builder->SetInsertPoint(&inst); auto func = createImplFunc(inst, {m_traceParams[TraceParam::HitTriangleVertexPositions]}); @@ -2506,7 +2638,7 @@ void SpirvLowerRayTracing::visitSetHitTriangleNodePointer(lgc::GpurtSetHitTriang // Visits "lgc.gpurt.get.ray.static.id" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitGetRayStaticId(lgc::GpurtGetRayStaticIdOp &inst) { +void SpirvLowerRayTracingImpl::visitGetRayStaticId(lgc::GpurtGetRayStaticIdOp &inst) { m_builder->SetInsertPoint(&inst); auto rayStaticId = m_builder->CreateLoad(m_builder->getInt32Ty(), m_traceParams[TraceParam::RayStaticId]); @@ -2520,7 +2652,7 @@ void SpirvLowerRayTracing::visitGetRayStaticId(lgc::GpurtGetRayStaticIdOp &inst) // Visits "lgc.gpurt.stack.read" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitStackReadOp(lgc::GpurtStackReadOp &inst) { +void SpirvLowerRayTracingImpl::visitStackReadOp(lgc::GpurtStackReadOp &inst) { // NOTE: If RayQuery is used inside intersection or any-hit shaders, where we already holding a traversal stack for // TraceRay, perform the stack operations for this RayQuery in an extra stack space. 
if ((m_shaderStage == ShaderStageRayTracingIntersect) || (m_shaderStage == ShaderStageRayTracingAnyHit)) @@ -2531,7 +2663,7 @@ void SpirvLowerRayTracing::visitStackReadOp(lgc::GpurtStackReadOp &inst) { // Visits "lgc.gpurt.stack.write" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitStackWriteOp(lgc::GpurtStackWriteOp &inst) { +void SpirvLowerRayTracingImpl::visitStackWriteOp(lgc::GpurtStackWriteOp &inst) { // NOTE: If RayQuery is used inside intersection or any-hit shaders, where we already holding a traversal stack for // TraceRay, perform the stack operations for this RayQuery in an extra stack space. if ((m_shaderStage == ShaderStageRayTracingIntersect) || (m_shaderStage == ShaderStageRayTracingAnyHit)) @@ -2542,7 +2674,7 @@ void SpirvLowerRayTracing::visitStackWriteOp(lgc::GpurtStackWriteOp &inst) { // Visits "lgc.gpurt.stack.init" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitLdsStackInitOp(lgc::GpurtLdsStackInitOp &inst) { +void SpirvLowerRayTracingImpl::visitLdsStackInitOp(lgc::GpurtLdsStackInitOp &inst) { // NOTE: If RayQuery is used inside any-hit shaders, where we already holding a traversal stack for // TraceRay, perform the stack operations for this RayQuery in an extra stack space. 
if (m_shaderStage == ShaderStageRayTracingAnyHit) @@ -2553,7 +2685,7 @@ void SpirvLowerRayTracing::visitLdsStackInitOp(lgc::GpurtLdsStackInitOp &inst) { // Visits "lgc.gpurt.get.parent.id" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitGetParentId(lgc::GpurtGetParentIdOp &inst) { +void SpirvLowerRayTracingImpl::visitGetParentId(lgc::GpurtGetParentIdOp &inst) { m_builder->SetInsertPoint(&inst); auto parentId = m_builder->CreateLoad(m_builder->getInt32Ty(), m_traceParams[TraceParam::ParentRayId]); @@ -2567,7 +2699,7 @@ void SpirvLowerRayTracing::visitGetParentId(lgc::GpurtGetParentIdOp &inst) { // Visits "lgc.gpurt.set.parent.id" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitSetParentId(lgc::GpurtSetParentIdOp &inst) { +void SpirvLowerRayTracingImpl::visitSetParentId(lgc::GpurtSetParentIdOp &inst) { m_builder->SetInsertPoint(&inst); m_builder->CreateStore(inst.getRayId(), m_traceParams[TraceParam::ParentRayId]); @@ -2580,7 +2712,7 @@ void SpirvLowerRayTracing::visitSetParentId(lgc::GpurtSetParentIdOp &inst) { // Visits "lgc.rt.dispatch.rays.index" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitDispatchRayIndex(lgc::rt::DispatchRaysIndexOp &inst) { +void SpirvLowerRayTracingImpl::visitDispatchRayIndex(lgc::rt::DispatchRaysIndexOp &inst) { m_builder->SetInsertPoint(&inst); auto dispatchRayIndex = m_builder->CreateReadBuiltInInput(lgc::BuiltInGlobalInvocationId); @@ -2594,7 +2726,7 @@ void SpirvLowerRayTracing::visitDispatchRayIndex(lgc::rt::DispatchRaysIndexOp &i // Visits "lgc.rt.dispatch.rays.dimensions" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitDispatchRaysDimensionsOp(lgc::rt::DispatchRaysDimensionsOp &inst) { +void SpirvLowerRayTracingImpl::visitDispatchRaysDimensionsOp(lgc::rt::DispatchRaysDimensionsOp &inst) { m_builder->SetInsertPoint(&inst); auto dispatchRaysDimensions = 
loadShaderTableVariable(ShaderTable::LaunchSize, m_dispatchRaysInfoDesc); @@ -2608,7 +2740,7 @@ void SpirvLowerRayTracing::visitDispatchRaysDimensionsOp(lgc::rt::DispatchRaysDi // Visits "lgc.rt.world.ray.origin" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitWorldRayOriginOp(lgc::rt::WorldRayOriginOp &inst) { +void SpirvLowerRayTracingImpl::visitWorldRayOriginOp(lgc::rt::WorldRayOriginOp &inst) { m_builder->SetInsertPoint(&inst); auto worldRayOrigin = m_builder->CreateLoad(m_traceParamsTys[TraceParam::Origin], m_traceParams[TraceParam::Origin]); @@ -2622,7 +2754,7 @@ void SpirvLowerRayTracing::visitWorldRayOriginOp(lgc::rt::WorldRayOriginOp &inst // Visits "lgc.rt.world.ray.direction" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitWorldRayDirectionOp(lgc::rt::WorldRayDirectionOp &inst) { +void SpirvLowerRayTracingImpl::visitWorldRayDirectionOp(lgc::rt::WorldRayDirectionOp &inst) { m_builder->SetInsertPoint(&inst); auto worldRayDir = m_builder->CreateLoad(m_traceParamsTys[TraceParam::Dir], m_traceParams[TraceParam::Dir]); @@ -2636,7 +2768,7 @@ void SpirvLowerRayTracing::visitWorldRayDirectionOp(lgc::rt::WorldRayDirectionOp // Visits "lgc.rt.object.ray.origin" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitObjectRayOriginOp(lgc::rt::ObjectRayOriginOp &inst) { +void SpirvLowerRayTracingImpl::visitObjectRayOriginOp(lgc::rt::ObjectRayOriginOp &inst) { m_builder->SetInsertPoint(&inst); Value *origin = m_builder->CreateLoad(m_traceParamsTys[TraceParam::Origin], m_traceParams[TraceParam::Origin]); @@ -2662,7 +2794,7 @@ void SpirvLowerRayTracing::visitObjectRayOriginOp(lgc::rt::ObjectRayOriginOp &in // Visits "lgc.rt.object.ray.direction" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitObjectRayDirectionOp(lgc::rt::ObjectRayDirectionOp &inst) { +void SpirvLowerRayTracingImpl::visitObjectRayDirectionOp(lgc::rt::ObjectRayDirectionOp 
&inst) { m_builder->SetInsertPoint(&inst); Value *dir = m_builder->CreateLoad(m_traceParamsTys[TraceParam::Dir], m_traceParams[TraceParam::Dir]); @@ -2687,7 +2819,7 @@ void SpirvLowerRayTracing::visitObjectRayDirectionOp(lgc::rt::ObjectRayDirection // Visits "lgc.rt.tmin" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitRayTminOp(lgc::rt::RayTminOp &inst) { +void SpirvLowerRayTracingImpl::visitRayTminOp(lgc::rt::RayTminOp &inst) { m_builder->SetInsertPoint(&inst); auto tMin = m_builder->CreateLoad(m_traceParamsTys[TraceParam::TMin], m_traceParams[TraceParam::TMin]); @@ -2701,7 +2833,7 @@ void SpirvLowerRayTracing::visitRayTminOp(lgc::rt::RayTminOp &inst) { // Visits "lgc.rt.tcurrent" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitRayTcurrentOp(lgc::rt::RayTcurrentOp &inst) { +void SpirvLowerRayTracingImpl::visitRayTcurrentOp(lgc::rt::RayTcurrentOp &inst) { m_builder->SetInsertPoint(&inst); auto tMax = m_builder->CreateLoad(m_traceParamsTys[TraceParam::TMax], m_traceParams[TraceParam::TMax]); @@ -2715,7 +2847,7 @@ void SpirvLowerRayTracing::visitRayTcurrentOp(lgc::rt::RayTcurrentOp &inst) { // Visits "lgc.rt.instance.index" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitInstanceIndexOp(lgc::rt::InstanceIndexOp &inst) { +void SpirvLowerRayTracingImpl::visitInstanceIndexOp(lgc::rt::InstanceIndexOp &inst) { m_builder->SetInsertPoint(&inst); auto instNodeAddr = createLoadInstNodeAddr(); @@ -2730,7 +2862,7 @@ void SpirvLowerRayTracing::visitInstanceIndexOp(lgc::rt::InstanceIndexOp &inst) // Visits "lgc.rt.object.to.world" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitObjectToWorldOp(lgc::rt::ObjectToWorldOp &inst) { +void SpirvLowerRayTracingImpl::visitObjectToWorldOp(lgc::rt::ObjectToWorldOp &inst) { m_builder->SetInsertPoint(&inst); auto objectToWorld = createLoadRayTracingMatrix(BuiltInObjectToWorldKHR); @@ -2744,7 +2876,7 @@ void 
SpirvLowerRayTracing::visitObjectToWorldOp(lgc::rt::ObjectToWorldOp &inst) // Visits "lgc.rt.world.to.object" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitWorldToObjectOp(lgc::rt::WorldToObjectOp &inst) { +void SpirvLowerRayTracingImpl::visitWorldToObjectOp(lgc::rt::WorldToObjectOp &inst) { m_builder->SetInsertPoint(&inst); m_worldToObjMatrix = !m_worldToObjMatrix ? createLoadRayTracingMatrix(BuiltInWorldToObjectKHR) : m_worldToObjMatrix; @@ -2758,7 +2890,7 @@ void SpirvLowerRayTracing::visitWorldToObjectOp(lgc::rt::WorldToObjectOp &inst) // Visits "lgc.rt.hit.kind" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitHitKindOp(lgc::rt::HitKindOp &inst) { +void SpirvLowerRayTracingImpl::visitHitKindOp(lgc::rt::HitKindOp &inst) { m_builder->SetInsertPoint(&inst); auto hitKind = m_builder->CreateLoad(m_traceParamsTys[TraceParam::Kind], m_traceParams[TraceParam::Kind]); @@ -2772,7 +2904,7 @@ void SpirvLowerRayTracing::visitHitKindOp(lgc::rt::HitKindOp &inst) { // Visits "lgc.rt.triangle.vertex.position" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitTriangleVertexPositionsOp(lgc::rt::TriangleVertexPositionsOp &inst) { +void SpirvLowerRayTracingImpl::visitTriangleVertexPositionsOp(lgc::rt::TriangleVertexPositionsOp &inst) { m_builder->SetInsertPoint(&inst); auto triangleVertexPositions = m_builder->CreateLoad(m_traceParamsTys[TraceParam::HitTriangleVertexPositions], @@ -2794,7 +2926,7 @@ void SpirvLowerRayTracing::visitTriangleVertexPositionsOp(lgc::rt::TriangleVerte // Visits "lgc.rt.ray.flags" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitRayFlagsOp(lgc::rt::RayFlagsOp &inst) { +void SpirvLowerRayTracingImpl::visitRayFlagsOp(lgc::rt::RayFlagsOp &inst) { m_builder->SetInsertPoint(&inst); auto rayFlags = m_builder->CreateLoad(m_traceParamsTys[TraceParam::RayFlags], m_traceParams[TraceParam::RayFlags]); @@ -2808,7 +2940,7 @@ void 
SpirvLowerRayTracing::visitRayFlagsOp(lgc::rt::RayFlagsOp &inst) { // Visits "lgc.rt.geometry.index" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitGeometryIndexOp(lgc::rt::GeometryIndexOp &inst) { +void SpirvLowerRayTracingImpl::visitGeometryIndexOp(lgc::rt::GeometryIndexOp &inst) { m_builder->SetInsertPoint(&inst); auto geometryIndex = @@ -2823,7 +2955,7 @@ void SpirvLowerRayTracing::visitGeometryIndexOp(lgc::rt::GeometryIndexOp &inst) // Visits "lgc.rt.instance.id" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitInstanceIdOp(lgc::rt::InstanceIdOp &inst) { +void SpirvLowerRayTracingImpl::visitInstanceIdOp(lgc::rt::InstanceIdOp &inst) { m_builder->SetInsertPoint(&inst); auto instNodeAddr = createLoadInstNodeAddr(); @@ -2838,7 +2970,7 @@ void SpirvLowerRayTracing::visitInstanceIdOp(lgc::rt::InstanceIdOp &inst) { // Visits "lgc.rt.primitive.index" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitPrimitiveIndexOp(lgc::rt::PrimitiveIndexOp &inst) { +void SpirvLowerRayTracingImpl::visitPrimitiveIndexOp(lgc::rt::PrimitiveIndexOp &inst) { m_builder->SetInsertPoint(&inst); auto primitiveIndex = @@ -2853,7 +2985,7 @@ void SpirvLowerRayTracing::visitPrimitiveIndexOp(lgc::rt::PrimitiveIndexOp &inst // Visits "lgc.rt.instance.inclusion.mask" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitInstanceInclusionMaskOp(lgc::rt::InstanceInclusionMaskOp &inst) { +void SpirvLowerRayTracingImpl::visitInstanceInclusionMaskOp(lgc::rt::InstanceInclusionMaskOp &inst) { m_builder->SetInsertPoint(&inst); auto cullMask = m_builder->CreateLoad(m_traceParamsTys[TraceParam::InstanceInclusionMask], @@ -2867,7 +2999,7 @@ void SpirvLowerRayTracing::visitInstanceInclusionMaskOp(lgc::rt::InstanceInclusi // Visits "lgc.rt.shader.index" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitShaderIndexOp(lgc::rt::ShaderIndexOp &inst) { +void 
SpirvLowerRayTracingImpl::visitShaderIndexOp(lgc::rt::ShaderIndexOp &inst) { // FIXME: This could be wrong if lgc.rt.shader.index is not in the same function as m_shaderRecordIndex, but is // this really the case? inst.replaceAllUsesWith(m_shaderRecordIndex); @@ -2880,7 +3012,7 @@ void SpirvLowerRayTracing::visitShaderIndexOp(lgc::rt::ShaderIndexOp &inst) { // Visits "lgc.rt.shader.record.buffer" instructions // // @param inst : The instruction -void SpirvLowerRayTracing::visitShaderRecordBufferOp(lgc::rt::ShaderRecordBufferOp &inst) { +void SpirvLowerRayTracingImpl::visitShaderRecordBufferOp(lgc::rt::ShaderRecordBufferOp &inst) { m_builder->SetInsertPoint(m_insertPosPastInit); auto tableIndex = inst.getShaderIndex(); @@ -2925,12 +3057,12 @@ void SpirvLowerRayTracing::visitShaderRecordBufferOp(lgc::rt::ShaderRecordBuffer Value *shaderIdsSizeVal = m_builder->getInt32(shaderIdsSize); tableAddr = m_builder->CreateAdd(tableAddr, m_builder->CreateZExt(shaderIdsSizeVal, m_builder->getInt64Ty())); - tableAddr = m_builder->create(tableAddr, tableStride); + tableAddr = m_builder->create(tableAddr, tableStride, false); tableAddr = m_builder->create(tableAddr, tableIndex); SmallVector toRemove; toRemove.push_back(&inst); - replaceAllPointerUses(m_builder, &inst, tableAddr, toRemove); + replaceAllPointerUses(&inst, tableAddr, toRemove); for (auto *I : reverse(toRemove)) I->eraseFromParent(); @@ -2940,7 +3072,7 @@ void SpirvLowerRayTracing::visitShaderRecordBufferOp(lgc::rt::ShaderRecordBuffer // Creates instructions to emit SQTT shader data call compact token // // @param stage : Ray tracing shader stage -void SpirvLowerRayTracing::createSqttCallCompactToken(ShaderStage stage) { +void SpirvLowerRayTracingImpl::createSqttCallCompactToken(ShaderStage stage) { // The token is a 32-bit uint compacted with following bit representation: // 31-13: extended data, 12-8: data_tokens, 7: extended, 6: special, 5-0: well_known // If extended is 0, this is a well known packet type, and 
data_tokens and extended_data may be interpreted as @@ -3002,14 +3134,14 @@ void SpirvLowerRayTracing::createSqttCallCompactToken(ShaderStage stage) { // ===================================================================================================================== // Creates instructions to emit SQTT shader data function return token -void SpirvLowerRayTracing::createSqttFunctionReturnToken() { +void SpirvLowerRayTracingImpl::createSqttFunctionReturnToken() { m_builder->CreateIntrinsic(Intrinsic::amdgcn_s_ttracedata_imm, {}, m_builder->getInt16(SqttWellKnownTypeFunctionReturn)); } // ===================================================================================================================== // Creates instructions to load instance node address -Value *SpirvLowerRayTracing::createLoadInstNodeAddr() { +Value *SpirvLowerRayTracingImpl::createLoadInstNodeAddr() { auto instNodeAddrTy = m_traceParamsTys[TraceParam::InstNodeAddrLo]; assert(instNodeAddrTy == m_traceParamsTys[TraceParam::InstNodeAddrHi]); Value *instNodeAddrLo = m_builder->CreateLoad(instNodeAddrTy, m_traceParams[TraceParam::InstNodeAddrLo]); @@ -3028,7 +3160,7 @@ Value *SpirvLowerRayTracing::createLoadInstNodeAddr() { // // @param inst : The instruction // @param args : Additional TraceSet arguments -llvm::Function *SpirvLowerRayTracing::createImplFunc(CallInst &inst, ArrayRef args) { +llvm::Function *SpirvLowerRayTracingImpl::createImplFunc(CallInst &inst, ArrayRef args) { std::string mangledName = inst.getCalledFunction()->getName().str() + ".impl"; SmallVector implCallArgs(inst.args()); for (auto &arg : args) { @@ -3042,7 +3174,7 @@ llvm::Function *SpirvLowerRayTracing::createImplFunc(CallInst &inst, ArrayRefgetFunction(mangledName); } -lgc::rt::RayTracingShaderStage SpirvLowerRayTracing::mapStageToLgcRtShaderStage(ShaderStage stage) { +lgc::rt::RayTracingShaderStage SpirvLowerRayTracingImpl::mapStageToLgcRtShaderStage(ShaderStage stage) { assert((stage >= ShaderStageRayTracingRayGen) && 
(stage <= ShaderStageRayTracingCallable)); return static_cast(stage - ShaderStageRayTracingRayGen); } @@ -3050,7 +3182,7 @@ lgc::rt::RayTracingShaderStage SpirvLowerRayTracing::mapStageToLgcRtShaderStage( // ===================================================================================================================== // Generate a static ID for current Trace Ray call // -unsigned SpirvLowerRayTracing::generateTraceRayStaticId() { +unsigned SpirvLowerRayTracingImpl::generateTraceRayStaticId() { Util::MetroHash64 hasher; hasher.Update(m_nextTraceRayId++); hasher.Update(m_module->getName().bytes_begin(), m_module->getName().size()); @@ -3065,7 +3197,7 @@ unsigned SpirvLowerRayTracing::generateTraceRayStaticId() { // Erase BasicBlocks from the Function // // @param func : Function -void SpirvLowerRayTracing::eraseFunctionBlocks(Function *func) { +void SpirvLowerRayTracingImpl::eraseFunctionBlocks(Function *func) { for (auto blockIt = func->begin(), blockEnd = func->end(); blockIt != blockEnd;) { BasicBlock *basicBlock = &*blockIt++; basicBlock->dropAllReferences(); @@ -3077,7 +3209,7 @@ void SpirvLowerRayTracing::eraseFunctionBlocks(Function *func) { // Call GpuRt Library Func to load a 3x4 matrix from given address at the current insert point // // @param instanceNodeAddr : instanceNode address, which type is i64 -Value *SpirvLowerRayTracing::createLoadMatrixFromFunc(Value *instanceNodeAddr, unsigned builtInId) { +Value *SpirvLowerRayTracingImpl::createLoadMatrixFromFunc(Value *instanceNodeAddr, unsigned builtInId) { auto floatx3Ty = FixedVectorType::get(m_builder->getFloatTy(), 3); auto matrixTy = ArrayType::get(floatx3Ty, 4); @@ -3126,7 +3258,7 @@ Value *SpirvLowerRayTracing::createLoadMatrixFromFunc(Value *instanceNodeAddr, u // ===================================================================================================================== // Looks up an exported function in the GPURT module -Function *SpirvLowerRayTracing::getGpurtFunction(StringRef 
name) { +Function *SpirvLowerRayTracingImpl::getGpurtFunction(StringRef name) { auto &gpurtContext = lgc::GpurtContext::get(*m_context); Function *fn = gpurtContext.theModule->getFunction(name); assert(fn); @@ -3139,7 +3271,7 @@ Function *SpirvLowerRayTracing::getGpurtFunction(StringRef name) { // So "isIndex = true" means we use InstanceId(InstanceIndex for GPURT) for vulkan, // and "isIndex = false" means we use InstanceIndex(InstanceId for GPURT) for vulkan, // @param instNodeAddr : 64-bit instance node address, in <2 x i32> -Value *SpirvLowerRayTracing::createLoadInstanceIndexOrId(Value *instNodeAddr, bool isIndex) { +Value *SpirvLowerRayTracingImpl::createLoadInstanceIndexOrId(Value *instNodeAddr, bool isIndex) { Value *instanceIdPtr = m_builder->CreateAllocaAtFuncEntry(m_builder->getInt64Ty()); m_builder->CreateStore(instNodeAddr, instanceIdPtr); @@ -3152,4 +3284,11 @@ Value *SpirvLowerRayTracing::createLoadInstanceIndexOrId(Value *instNodeAddr, bo return cmiResult.returnValue; } +// ===================================================================================================================== +// Run the pass with a one-time instantiation of the impl class. This ensures that each run starts with a clean state. +PreservedAnalyses SpirvLowerRayTracing::run(Module &module, ModuleAnalysisManager &analysisManager) { + SpirvLowerRayTracingImpl impl; + return impl.run(module, analysisManager); +} + } // namespace Llpc diff --git a/llpc/lowering/LowerRayTracing.h b/llpc/lowering/LowerRayTracing.h index 5a93734527..06e911b449 100644 --- a/llpc/lowering/LowerRayTracing.h +++ b/llpc/lowering/LowerRayTracing.h @@ -191,131 +191,11 @@ enum RayFlag : unsigned { // ===================================================================================================================== // Represents the pass of SPIR-V lowering ray tracing. 
-class SpirvLowerRayTracing : public SpirvLower, public llvm::PassInfoMixin { +class SpirvLowerRayTracing : public llvm::PassInfoMixin { public: - SpirvLowerRayTracing(); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Lower SPIR-V RayTracing operations"; } - -private: - void eraseFunctionBlocks(llvm::Function *func); - llvm::Value *createLoadInstanceIndexOrId(Value *instNodeAddr, bool isIndex); - llvm::Value *createLoadMatrixFromFunc(llvm::Value *matrixAddr, unsigned builtInId); - llvm::Function *getGpurtFunction(llvm::StringRef name); - void createTraceParams(llvm::Function *func); - void createRayGenEntryFunc(); - unsigned generateTraceRayStaticId(); - void processShaderRecordBuffer(llvm::GlobalVariable *global, llvm::Value *bufferDesc, llvm::Value *tableIndex, - llvm::Instruction *insertPos); - llvm::CallInst *createTraceRay(); - void createSetHitAttributes(llvm::Function *func, unsigned instArgsNum, unsigned traceParamsOffset); - void createSetTraceParams(llvm::Function *func, unsigned instArgNum); - void createAnyHitFunc(llvm::Value *shaderIdentifier, llvm::Value *shaderRecordIndex); - void createCallShaderFunc(llvm::Function *func, ShaderStage stage, unsigned intersectId, llvm::Value *retVal, - unsigned traceParamsArgOffset); - void createCallShader(llvm::Function *func, ShaderStage stage, unsigned intersectId, llvm::Value *shaderId, - llvm::Value *shaderRecordIndex, llvm::Value *inputResult, llvm::BasicBlock *entryBlock, - llvm::BasicBlock *endBlock, unsigned traceParamsArgOffset); - void updateGlobalFromCallShaderFunc(llvm::Function *func, ShaderStage stage, unsigned traceParamsArgOffset); - void createSetTriangleInsection(llvm::Function *func); - void createShaderSelection(llvm::Function *func, llvm::BasicBlock *entryBlock, llvm::BasicBlock *endBlock, - llvm::Value *shaderId, unsigned intersectId, ShaderStage stage, - llvm::ArrayRef args, llvm::Value *result, 
llvm::Type *inResultTy); - llvm::Value *loadShaderTableVariable(ShaderTable tableKind, llvm::Value *bufferDesc); - llvm::Value *getShaderIdentifier(ShaderStage stage, llvm::Value *shaderRecordIndex, llvm::Value *bufferDesc); - void createDbgInfo(llvm::Module &module, llvm::Function *func); - void processTerminalFunc(llvm::Function *func, llvm::CallInst *inst, RayHitStatus hitStatus); - void initTraceParamsTy(unsigned attributeSize); - void initShaderBuiltIns(); - void inlineTraceRay(llvm::CallInst *callInst, ModuleAnalysisManager &analysisManager); - llvm::Instruction *createEntryFunc(llvm::Function *func); - void createEntryTerminator(llvm::Function *func); - llvm::FunctionType *getShaderEntryFuncTy(ShaderStage stage, llvm::SmallVectorImpl &argNames); - llvm::FunctionType *getCallableShaderEntryFuncTy(llvm::SmallVectorImpl &argNames); - llvm::FunctionType *getTraceRayFuncTy(); - void createDispatchRaysInfoDesc(); - llvm::Instruction *createCallableShaderEntryFunc(llvm::Function *func); - void createCallableShaderEntryTerminator(llvm::Function *func); - llvm::SmallVector getFuncRets(llvm::Function *func) const; - llvm::SmallSet getShaderExtraInputParams(ShaderStage stage); - llvm::SmallSet getShaderExtraRets(ShaderStage stage); - llvm::Type *getShaderReturnTy(ShaderStage stage); - void storeFunctionCallResult(ShaderStage stage, llvm::Value *result, llvm::Argument *traceIt); - void initInputResult(ShaderStage stage, llvm::Value *payload, llvm::Value *traceParams[], llvm::Value *result, - llvm::Argument *traceIt); - void cloneDbgInfoSubgrogram(llvm::Function *func, llvm::Function *newfunc); - llvm::Value *createLoadRayTracingMatrix(unsigned builtInId); - void createSetHitTriangleNodePointer(llvm::Function *func); - llvm::Function *getOrCreateRemapCapturedVaToReplayVaFunc(); - - void visitAcceptHitAndEndSearchOp(lgc::rt::AcceptHitAndEndSearchOp &inst); - void visitIgnoreHitOp(lgc::rt::IgnoreHitOp &inst); - void visitCallCallableShaderOp(lgc::rt::CallCallableShaderOp 
&inst); - void visitReportHitOp(lgc::rt::ReportHitOp &inst); - void visitTraceRayOp(lgc::rt::TraceRayOp &inst); - void processTraceRayCall(lgc::rt::BaseTraceRayOp *inst); - - llvm::Function *createImplFunc(llvm::CallInst &inst, llvm::ArrayRef args); - - void visitGetHitAttributes(lgc::GpurtGetHitAttributesOp &inst); - void visitSetHitAttributes(lgc::GpurtSetHitAttributesOp &inst); - void visitSetTraceParams(lgc::GpurtSetTraceParamsOp &inst); - void visitCallClosestHitShader(lgc::GpurtCallClosestHitShaderOp &inst); - void visitCallMissShader(lgc::GpurtCallMissShaderOp &inst); - void visitCallTriangleAnyHitShader(lgc::GpurtCallTriangleAnyHitShaderOp &inst); - void visitCallIntersectionShader(lgc::GpurtCallIntersectionShaderOp &inst); - void visitSetTriangleIntersectionAttributes(lgc::GpurtSetTriangleIntersectionAttributesOp &inst); - void visitSetHitTriangleNodePointer(lgc::GpurtSetHitTriangleNodePointerOp &inst); - void visitGetParentId(lgc::GpurtGetParentIdOp &inst); - void visitSetParentId(lgc::GpurtSetParentIdOp &inst); - void visitGetRayStaticId(lgc::GpurtGetRayStaticIdOp &inst); - void visitStackReadOp(lgc::GpurtStackReadOp &inst); - void visitStackWriteOp(lgc::GpurtStackWriteOp &inst); - void visitLdsStackInitOp(lgc::GpurtLdsStackInitOp &inst); - void visitDispatchRayIndex(lgc::rt::DispatchRaysIndexOp &inst); - void visitDispatchRaysDimensionsOp(lgc::rt::DispatchRaysDimensionsOp &inst); - void visitWorldRayOriginOp(lgc::rt::WorldRayOriginOp &inst); - void visitWorldRayDirectionOp(lgc::rt::WorldRayDirectionOp &inst); - void visitObjectRayOriginOp(lgc::rt::ObjectRayOriginOp &inst); - void visitObjectRayDirectionOp(lgc::rt::ObjectRayDirectionOp &inst); - void visitRayTminOp(lgc::rt::RayTminOp &inst); - void visitRayTcurrentOp(lgc::rt::RayTcurrentOp &inst); - void visitInstanceIndexOp(lgc::rt::InstanceIndexOp &inst); - void visitObjectToWorldOp(lgc::rt::ObjectToWorldOp &inst); - void visitWorldToObjectOp(lgc::rt::WorldToObjectOp &inst); - void 
visitHitKindOp(lgc::rt::HitKindOp &inst); - void visitTriangleVertexPositionsOp(lgc::rt::TriangleVertexPositionsOp &inst); - void visitRayFlagsOp(lgc::rt::RayFlagsOp &inst); - void visitGeometryIndexOp(lgc::rt::GeometryIndexOp &inst); - void visitInstanceIdOp(lgc::rt::InstanceIdOp &inst); - void visitPrimitiveIndexOp(lgc::rt::PrimitiveIndexOp &inst); - void visitInstanceInclusionMaskOp(lgc::rt::InstanceInclusionMaskOp &inst); - void visitShaderIndexOp(lgc::rt::ShaderIndexOp &inst); - void visitShaderRecordBufferOp(lgc::rt::ShaderRecordBufferOp &inst); - - void createSqttCallCompactToken(ShaderStage stage); - void createSqttFunctionReturnToken(); - - llvm::Value *createLoadInstNodeAddr(); - - lgc::rt::RayTracingShaderStage mapStageToLgcRtShaderStage(ShaderStage stage); - std::optional m_crossModuleInliner; - unsigned m_spirvOpMetaKindId; // Metadata kind ID for "spirv.op" - - llvm::Value *m_traceParams[TraceParam::Count]; // Trace ray set parameters - llvm::StringRef m_traceParamNames[TraceParam::Count]; - llvm::Value *m_worldToObjMatrix = nullptr; // World to Object matrix - llvm::AllocaInst *m_callableData = nullptr; // Callable data variable for current callable shader - std::set> m_builtInParams; // Indirect max builtins; - llvm::SmallVector m_traceParamsTys; // Trace Params types - llvm::SmallVector m_callsToLower; // Call instruction to lower - llvm::SmallSet m_funcsToLower; // Functions to lower - llvm::Value *m_dispatchRaysInfoDesc = nullptr; // Descriptor of the DispatchRaysInfo - llvm::Value *m_shaderRecordIndex = nullptr; // Variable sourced from entry function argument - llvm::Instruction *m_insertPosPastInit = nullptr; // Insert position after initialization instructions (storing trace - // parameters, payload, callable data, etc.) 
- unsigned m_nextTraceRayId; // Next trace ray ID to be used for ray history }; } // namespace Llpc diff --git a/llpc/lowering/Lowering.cpp b/llpc/lowering/Lowering.cpp index 53ac3a9bdf..d8e3a4bde5 100644 --- a/llpc/lowering/Lowering.cpp +++ b/llpc/lowering/Lowering.cpp @@ -97,7 +97,7 @@ namespace Llpc { // @param [in/out] passMgr : Pass manager to add passes to // @param lowerTimer : Timer to time lower passes with, nullptr if not timing // @param lowerFlag : Add the required pass based on the flag -void SpirvLower::addPasses(Context *context, ShaderStage stage, lgc::PassManager &passMgr, Timer *lowerTimer, +void SpirvLower::addPasses(Context *context, ShaderStage stage, ModulePassManager &passMgr, Timer *lowerTimer, LowerFlag lowerFlag) { // Start timer for lowering passes. if (lowerTimer) @@ -202,20 +202,26 @@ void SpirvLower::addPasses(Context *context, ShaderStage stage, lgc::PassManager // Register all the translation passes into the given pass manager // // @param [in/out] passMgr : Pass manager -void SpirvLower::registerTranslationPasses(lgc::PassManager &passMgr) { +template void SpirvLower::registerTranslationPasses(PassManagerT &passMgr) { passMgr.registerPass("lower-translator", LowerTranslator::name()); passMgr.registerPass("lower-gpurt-library", ProcessGpuRtLibrary::name()); } +template void SpirvLower::registerTranslationPasses(lgc::PassManager &); +template void SpirvLower::registerTranslationPasses(lgc::MbPassManager &); + // ===================================================================================================================== // Register all the lowering passes into the given pass manager // // @param [in/out] passMgr : Pass manager -void SpirvLower::registerLoweringPasses(lgc::PassManager &passMgr) { +template void SpirvLower::registerLoweringPasses(PassManagerT &passMgr) { #define LLPC_PASS(NAME, CLASS) passMgr.registerPass(NAME, CLASS::name()); #include "PassRegistry.inc" } +template void 
SpirvLower::registerLoweringPasses(lgc::PassManager &); +template void SpirvLower::registerLoweringPasses(lgc::MbPassManager &); + // ===================================================================================================================== // Replace global variable with another global variable // diff --git a/llpc/lowering/Lowering.h b/llpc/lowering/Lowering.h index 22cb7ae4a4..1f30403139 100644 --- a/llpc/lowering/Lowering.h +++ b/llpc/lowering/Lowering.h @@ -32,6 +32,7 @@ #include "llpc.h" #include "llpcUtil.h" +#include "llvm/IR/PassManager.h" #include "llvm/Pass.h" namespace llvm { @@ -44,6 +45,7 @@ class Timer; namespace lgc { class Builder; +class MbPassManager; class PassManager; } // namespace lgc @@ -70,12 +72,12 @@ class SpirvLower { explicit SpirvLower() {} // Add per-shader lowering passes to pass manager - static void addPasses(Context *context, ShaderStage stage, lgc::PassManager &passMgr, llvm::Timer *lowerTimer, + static void addPasses(Context *context, ShaderStage stage, llvm::ModulePassManager &passMgr, llvm::Timer *lowerTimer, LowerFlag lowerFlag); // Register all the translation passes into the given pass manager - static void registerTranslationPasses(lgc::PassManager &passMgr); + template static void registerTranslationPasses(PassManagerT &passMgr); // Register all the lowering passes into the given pass manager - static void registerLoweringPasses(lgc::PassManager &passMgr); + template static void registerLoweringPasses(PassManagerT &passMgr); static void replaceGlobal(Context *context, llvm::GlobalVariable *original, llvm::GlobalVariable *replacement); @@ -89,4 +91,10 @@ class SpirvLower { lgc::Builder *m_builder = nullptr; // LGC builder object }; +extern template void SpirvLower::registerTranslationPasses(lgc::PassManager &); +extern template void SpirvLower::registerTranslationPasses(lgc::MbPassManager &); + +extern template void SpirvLower::registerLoweringPasses(lgc::PassManager &); +extern template void 
SpirvLower::registerLoweringPasses(lgc::MbPassManager &); + } // namespace Llpc diff --git a/llpc/lowering/LoweringUtil.cpp b/llpc/lowering/LoweringUtil.cpp index a05f4bc48a..a519418736 100644 --- a/llpc/lowering/LoweringUtil.cpp +++ b/llpc/lowering/LoweringUtil.cpp @@ -35,6 +35,7 @@ #include "llpcUtil.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/Linker/Linker.h" using namespace llvm; @@ -134,7 +135,9 @@ BasicBlock *clearBlock(Function *func) { // Clear non entry external functions // @param module : LLVM module to remove functions. // @param entryName : Entry Function Name -void clearNonEntryFunctions(Module *module, StringRef entryName) { +// @return whether anything was changed +bool clearNonEntryFunctions(Module *module, StringRef entryName) { + bool change = false; for (auto funcIt = module->begin(), funcEnd = module->end(); funcIt != funcEnd;) { Function *func = &*funcIt++; if ((func->getLinkage() == GlobalValue::ExternalLinkage || func->getLinkage() == GlobalValue::WeakAnyLinkage) && @@ -142,9 +145,32 @@ void clearNonEntryFunctions(Module *module, StringRef entryName) { if (!func->getName().starts_with(entryName)) { func->dropAllReferences(); func->eraseFromParent(); + change = true; } } } + return change; +} + +// ===================================================================================================================== +// Run the ClearNonEntryFunctions pass on the given Module. +PreservedAnalyses ClearNonEntryFunctionsPass::run(Module &module, ModuleAnalysisManager &analysisManager) { + return clearNonEntryFunctions(&module, m_entryName) ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +// ===================================================================================================================== +// Run the MergeModules pass on the given ModuleBunch. 
+PreservedAnalyses MergeModulesPass::run(ModuleBunch &moduleBunch, ModuleBunchAnalysisManager &analysisManager) { + auto modules = moduleBunch.getMutableModules(); + if (modules.size() < 2) + return PreservedAnalyses::all(); + + Linker linker(*modules[0]); + for (auto &module : modules.drop_front()) + linker.linkInModule(std::move(module)); + + moduleBunch.renormalize(); + return PreservedAnalyses::none(); } // ===================================================================================================================== diff --git a/llpc/lowering/LoweringUtil.h b/llpc/lowering/LoweringUtil.h index aa40d7867c..2cf9e74e6b 100644 --- a/llpc/lowering/LoweringUtil.h +++ b/llpc/lowering/LoweringUtil.h @@ -32,6 +32,8 @@ #include "SPIRVInternal.h" #include "llpc.h" +#include "compilerutils/ModuleBunch.h" +#include "llvm/IR/PassManager.h" namespace llvm { @@ -75,8 +77,25 @@ void getEntryPoints(llvm::Module *module, llvm::SmallVectorImpl { +public: + ClearNonEntryFunctionsPass(llvm::StringRef entryName) : m_entryName(entryName) {} + + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + +private: + std::string m_entryName; +}; + +// Merge (link) all modules in a ModuleBunch into a single module. +class MergeModulesPass : public llvm::PassInfoMixin { +public: + llvm::PreservedAnalyses run(llvm::ModuleBunch &moduleBunch, llvm::ModuleBunchAnalysisManager &analysisManager); +}; // Get in/out meta data recursively. 
void decodeInOutMetaRecursively(llvm::Type *valueTy, llvm::Constant *mds, llvm::SmallVector &out); diff --git a/llpc/lowering/PrepareTransformVertexShader.cpp b/llpc/lowering/PrepareTransformVertexShader.cpp new file mode 100644 index 0000000000..ae66d23fc2 --- /dev/null +++ b/llpc/lowering/PrepareTransformVertexShader.cpp @@ -0,0 +1,183 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file PrepareTransformVertexShader.cpp +* @brief Prepare a vertex shader for linking into a transform compute shader. +*********************************************************************************************************************** +*/ +#include "PrepareTransformVertexShader.h" +#include "LoweringUtil.h" +#include "llpcContext.h" +#include "compilerutils/CompilerUtils.h" +#include "lgc/Builder.h" +#include "llvm/IR/Instructions.h" + +using namespace lgc; +using namespace llvm; +using namespace CompilerUtils; + +namespace Llpc { +#define DEBUG_TYPE "prepare-transform-shader" +static const char TransformVsEntry[] = "TransformVertexEntry"; + +// ===================================================================================================================== +// Executes this SPIR-V lowering pass on the specified LLVM module. 
+// +// @param [in/out] module : LLVM module to be run on +// @param [in/out] analysisManager : Analysis manager to use for this transformation +PreservedAnalyses PrepareTransformVertexShader::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Prepare-transform-vertexShader\n"); + SpirvLower::init(&module); + collectVsOutputSymbols(module); + + Function *func = module.getFunction("main"); + if (func != nullptr) + genFunTransformVertex(*func); + + return PreservedAnalyses::none(); +} + +// ===================================================================================================================== +// Collect Vertex shader output builtins: gl_Position, gl_ClipDistance[], gl_FrontColor, gl_TexCoord +// +// @param [in/out] module : LLVM module to be run on +void PrepareTransformVertexShader::collectVsOutputSymbols(Module &module) { + for (auto &global : module.globals()) { + auto type = global.getType(); + if (type->getAddressSpace() == SPIRAS_Output) { + MDNode *metaNode = global.getMetadata(gSPIRVMD::InOut); + assert(metaNode); + auto inOutMetaConst = mdconst::dyn_extract(metaNode->getOperand(0)); + auto valueType = global.getValueType(); + SmallVector mds; + decodeInOutMetaRecursively(valueType, inOutMetaConst, mds); + + for (auto md : mds) { + if (md.IsBuiltIn) { + if (md.Value == spv::BuiltInPosition) { + m_outputBuiltIns[TransformVertexVariable::Position] = &global; + } else if (md.Value == spv::BuiltInClipDistance) { + m_outputBuiltIns[TransformVertexVariable::ClipDistance0] = &global; + m_outputBuiltIns[TransformVertexVariable::ClipDistance1] = &global; + } + } else { + if (md.IsLoc) { + if (md.Value == Vkgc::GlCompatibilityInOutLocation::FrontColor) { + m_outputBuiltIns[TransformVertexVariable::FrontColor] = &global; + } else if (md.Value == Vkgc::GlCompatibilityInOutLocation::TexCoord) { + m_outputBuiltIns[TransformVertexVariable::TexCoord] = &global; + } + } + } + } + } + } +} + +// 
===================================================================================================================== +// Load the clip distance component from structure member +// +// @param [in] index : Indicates which variable is being accessed, TransformClipDistance0 or TransformClipDistance1 +// @param [in] component : Indicates which component of the clip distance to load +// @returns Value of the component +Value *PrepareTransformVertexShader::loadClipDistanceComponent(unsigned index, unsigned component) { + auto clipDistance = cast(m_outputBuiltIns[index]); + Type *ty = clipDistance->getValueType(); + auto arraySize = ty->getArrayNumElements(); + auto floatType = m_builder->getFloatTy(); + + unsigned redirectIdx = (index == TransformVertexVariable::ClipDistance1) ? component + 4 : component; + if (redirectIdx < arraySize) { + return m_builder->CreateLoad(floatType, m_builder->CreateConstGEP2_32(ty, clipDistance, 0, redirectIdx)); + } else { + return ConstantFP::get(floatType, 1.0); + } +} + +// ===================================================================================================================== +// Generate transform vertex shader: TransformVertexEntry +// +// @param [in] function : The main function of the original vertex shader +void PrepareTransformVertexShader::genFunTransformVertex(Function &function) { + // 1. Create a structure to store vs output: gl_Position, gl_ClipDistance[0~3], gl_ClipDistance[4~7], + // gl_FrontColor and gl_TexCoord + auto floatType = m_builder->getFloatTy(); + Type *vec4Type = VectorType::get(floatType, 4, false); + auto structTy = StructType::get(*m_context, {vec4Type, vec4Type, vec4Type, vec4Type, vec4Type}); + Value *vsOutput = PoisonValue::get(structTy); + + // 2. Handle early returns + m_unifiedReturn = CompilerUtils::unifyReturns(function, *m_builder); + m_builder->SetInsertPoint(m_unifiedReturn); + + // 3. 
Store gl_Position, gl_ClipDistance, gl_FrontColor and gl_TextureCoord[0] in the struct + // If any of these variables are not existed, set them to the default value vec4(1.0f) + auto floatOne = ConstantFP::get(floatType, 1.0); + auto vecOne = ConstantVector::get({floatOne, floatOne, floatOne, floatOne}); + + for (unsigned idx = 0; idx < TransformVertexVariable::Count; idx++) { + Value *memberValue; + if (m_outputBuiltIns[idx] != nullptr) { + // gl_ClipDistance need to be handled specially + if (idx == TransformVertexVariable::ClipDistance0 || idx == TransformVertexVariable::ClipDistance1) { + memberValue = PoisonValue::get(vec4Type); + for (unsigned i = 0; i < 4; i++) { + Value *clipValue = loadClipDistanceComponent(idx, i); + memberValue = m_builder->CreateInsertElement(memberValue, clipValue, i); + } + } else { + memberValue = m_builder->CreateLoad(vec4Type, m_outputBuiltIns[idx]); + } + } else { + memberValue = vecOne; + } + vsOutput = m_builder->CreateInsertValue(vsOutput, memberValue, idx); + } + + // 4. Remove the instruction of "return void", insert the instruction for the new return + m_builder->CreateRet(vsOutput); + m_unifiedReturn->eraseFromParent(); + + // 5. Create a new function as following + // { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float> } + // @transform_vs_entry(i32 %vertexId, i32 %InstanceId, i32 %drawId, i32 %baseVertex, i32 %baseInstance) + auto int32Ty = m_builder->getInt32Ty(); + SmallVector allArgTys = {int32Ty, int32Ty, int32Ty, int32Ty, int32Ty}; + Function *transformVertexFunc = mutateFunctionArguments(function, structTy, allArgTys, function.getAttributes()); + transformVertexFunc->setName(TransformVsEntry); + + // 6. Transfer function body from old function to new function. + while (!function.empty()) { + BasicBlock *block = &function.front(); + block->removeFromParent(); + block->insertInto(transformVertexFunc); + } + + // 7. 
Remove the old main function and its metadata + function.dropAllReferences(); + function.getParent()->getFunctionList().remove(&function); +} +} // namespace Llpc diff --git a/llpc/lowering/PrepareTransformVertexShader.h b/llpc/lowering/PrepareTransformVertexShader.h new file mode 100644 index 0000000000..b9b7227cc9 --- /dev/null +++ b/llpc/lowering/PrepareTransformVertexShader.h @@ -0,0 +1,63 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file PrepareTransformVertexShader.h + * @brief LLPC header file: contains declaration of Llpc::PrepareTransformVertexShader + *********************************************************************************************************************** + */ +#pragma once +#include "Lowering.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +class Value; +class User; +class ReturnInst; +} // namespace llvm + +namespace Llpc { +class PrepareTransformVertexShader : public SpirvLower, public llvm::PassInfoMixin { +public: + PrepareTransformVertexShader() {} + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + +private: + enum TransformVertexVariable : unsigned { + Position = 0, + ClipDistance0 = 1, + ClipDistance1 = 2, + FrontColor = 3, + TexCoord = 4, + Count = 5, + }; + + void collectVsOutputSymbols(llvm::Module &module); + void genFunTransformVertex(llvm::Function &function); + llvm::Value *loadClipDistanceComponent(unsigned index, unsigned component); + llvm::User *m_outputBuiltIns[Count] = {nullptr}; + llvm::ReturnInst *m_unifiedReturn = nullptr; +}; +} // namespace Llpc diff --git a/llpc/lowering/ProcessGpuRtLibrary.cpp b/llpc/lowering/ProcessGpuRtLibrary.cpp index 065bb32bbd..0c1b1c3ae9 100644 --- a/llpc/lowering/ProcessGpuRtLibrary.cpp +++ b/llpc/lowering/ProcessGpuRtLibrary.cpp @@ -229,9 +229,6 @@ void ProcessGpuRtLibrary::processLibraryFunction(Function *&func) { m_builder->SetInsertPoint(clearBlock(func)); createEnqueue(func); return; - } else if (funcName.starts_with("_AmdRestoreSystemData")) { - // We don't need this, leave it as dummy function so that it does nothing. 
- return; } else if (funcName.starts_with("_AmdValueI32Count")) { ContHelper::handleValueI32Count(*func, *m_builder); return; @@ -306,21 +303,12 @@ void ProcessGpuRtLibrary::processLibraryFunction(Function *&func) { if (isAmdIntrinsic) newFunc->deleteBody(); - // Fixup WaitAwait by removing the wait mask, and fixup [Wait]AwaitTraversal by adding a dummy return address. - // AwaitTraversal doesn't have a return address in HLSL because the return address is written to system data. - bool isWaitAwait = newFunc->getName().starts_with("_AmdWaitAwait"); - bool isNonWaitAwait = newFunc->getName().starts_with("_AmdAwait"); - bool isAwaitTraversal = (isWaitAwait || isNonWaitAwait) && newFunc->getName().contains("Traversal"); - if (isWaitAwait || isAwaitTraversal) { + // Fixup WaitAwait by removing the wait mask. + if (newFunc->getName().starts_with("_AmdWaitAwait")) { llvm::forEachCall(*newFunc, [&](CallInst &CInst) { SmallVector args(CInst.args()); // Remove wait mask - if (isWaitAwait) - args.erase(args.begin() + 1); - - // Add dummy return address - if (isAwaitTraversal) - args.insert(args.begin() + 1, PoisonValue::get(m_builder->getInt64Ty())); + args.erase(args.begin() + 1); m_builder->SetInsertPoint(&CInst); auto *newValue = m_builder->CreateNamedCall("_AmdAwait", CInst.getType(), args, {}); @@ -970,12 +958,12 @@ void ProcessGpuRtLibrary::createEnqueue(Function *func) { SmallVector tailArgs; bool hasWaitMaskArg = funcName.contains("Wait"); // Skip waitMask - unsigned retAddrArgIdx = hasWaitMaskArg ? 2 : 1; + const unsigned shaderIdxArgIdx = hasWaitMaskArg ? 2 : 1; + Value *shaderIndex = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(shaderIdxArgIdx)); + const unsigned retAddrArgIdx = shaderIdxArgIdx + 1; + Value *retAddr = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(retAddrArgIdx)); - // Get shader-index from system-data. 
- unsigned systemDataArgIdx = retAddrArgIdx + 1; - tailArgs.push_back(m_builder->CreateNamedCall("_cont_GetLocalRootIndex", m_builder->getInt32Ty(), - {func->getArg(systemDataArgIdx)}, {})); + const unsigned systemDataArgIdx = retAddrArgIdx + 1; // Process system-data and arguments after. unsigned argIdx = systemDataArgIdx; while (argIdx < func->arg_size()) { @@ -984,7 +972,7 @@ void ProcessGpuRtLibrary::createEnqueue(Function *func) { } // TODO: pass the levelMask correctly. - m_builder->create(addr, -1, PoisonValue::get(m_builder->getInt32Ty()), retAddr, tailArgs); + m_builder->create(addr, -1, PoisonValue::get(m_builder->getInt32Ty()), shaderIndex, retAddr, tailArgs); m_builder->CreateUnreachable(); // Clear the name so that earlyGpurtTransform doesn't try to handle the function. diff --git a/llpc/test/shaderdb/bugs/ArrayOfVariablePointers.spvasm b/llpc/test/shaderdb/bugs/ArrayOfVariablePointers.spvasm index 975170a716..65f80fd69c 100644 --- a/llpc/test/shaderdb/bugs/ArrayOfVariablePointers.spvasm +++ b/llpc/test/shaderdb/bugs/ArrayOfVariablePointers.spvasm @@ -17,7 +17,7 @@ ; } ; ; It passes SPIR-V validation and is indeed valid to the best of my knowledge, -; but crashes in PatchBufferOp at the time of writing because it's trying to +; but crashes in LowerBufferOperations at the time of writing because it's trying to ; load/store addrspace(7) pointers. 
; SPIR-V diff --git a/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm b/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm index 486e539c07..a1cd3df710 100644 --- a/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm +++ b/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm @@ -106,7 +106,7 @@ ; SHADERTEST-NEXT: [[TMP12:%.*]] = load i64, ptr addrspace(5) [[_12]], align 8 ; SHADERTEST-NEXT: call void @spirv.NonUniform.i64(i64 [[TMP12]]) ; SHADERTEST-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32 -; SHADERTEST-NEXT: [[TMP14:%.*]] = getelementptr [4294967295 x i8], ptr null, i32 0, i32 [[TMP13]] +; SHADERTEST-NEXT: [[TMP14:%.*]] = getelementptr [0 x i8], ptr null, i32 0, i32 [[TMP13]] ; SHADERTEST-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP14]] to i32 ; SHADERTEST-NEXT: [[TMP16:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[_11]], 0 ; SHADERTEST-NEXT: [[TMP17:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[_11]], 1 diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.elf b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.elf deleted file mode 100644 index 6814fa96dd2f0cc0fd0486f53fbf88614ace8f6e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10616 zcmds7-ESOM6~D7qR%S{Kr`@g3g<^)e-QRhw*@lF`w-H*;QTYNi?Bb^e&PKqH#3`uUfkT= z9M4=Bxw)G9Tsw0pb9vmqHV)SW^3$86Y<`365@tr(lN%%>%#0rzzx)Y){hJ$P6t91K zgX~6WY>+XO%NyiAlqWXG9+1k+21yGuVPE&B;ojFb$lmnKYU;7J*Vnf;$T(hyzV@Mn zz8*jceeFjHeH{RaeF^EAF=5ZgGhmn2m#^G>F*Vw@L2iP52-!C~k{OoVwO?L=&V?rrFH6f&$Xf9daVaI*Vr^WmNm8r~feE{HaM^^ft*x>~ZkOhEz9{`LA+RB4=`5pGz z)0_iPDi@0)c7=N%QZA9$#=wm@BcB`qxMKi7Au#}mt3d#h$pJt(4bXsryN4$b z^Z|o#ijW5k?PTbcvr|Fw$Rpxp2170(4RQlblszUkhaS*kypf%dz`+MV;ad|pn5sE^O2EAhvnl2S z(RiIm%vOU*i6z3TsDV6~lvpCX$^m>*a?};ga`mb!mP_X$|+Ym{w*lO8TEzB?xCwBjx2T>K8 zg~;unL|Gzp5O+pM;JRa$P2KhE1uCE#LuNH#s|gC$LJgwMU@#(v_D1LoNI14|2FK|b 
zHSSU93?5f{S!*~Zy#Mf|5xfur%tAtd5pEUiNq01}x zfN~lYq#WeT1Z)+CvO@XtQqM%dDF-vk;eaBP1B0i}|C|TH87Hpyd6G|a9se_c8b~=I zVaJBTNXvzkqcBL9BS=2wgp?f@3XwXLqcBbcJ$~v?4x5rg%8B6J1?TW_8d46_FQ`*b zF;)mt56nS2i)q=N=DC2BrE^`YX<3dG6ar0N1{8)b1NhP=6aq>y4-114A^^4-YMIMR zDv$`Eu9OQrx$V^;fkLKB129oU1DMWeD~nMMTkIT%YXJE_EY}5n^&uhKWMd-)4|1eg zsEo51SAnhm^JO2-`^lAkbnaf4eK5gvC(S`s#)ev%@~Sczg%G$sbm9>grbMM;mKf48{FQ+LPo9XYD1B|RddWVk@~x8l%R*ZmD6Gt*W~BD5;+QhSp`om zSnQ>94p3kx>B3eTpcX^?mEmdN%~$;aq4T3-(G{*M1j*a=b|lqfVD0k$#+X_{p~n_9GyhHlqvCqW3jQcWY$&3 zt<_EcA5{hB1^62k=7rkQ66S%_Tb1vB)2*&diwiUll{4flxFC-k<^yKcuuGM#9*T7x z*KGH$krt z-hva$P+M?(%RfHfb^JUYUzc3Vuu3X+rNS~exgjlUbz?;X%R4Tt1UeUgCh5-K$ieH9 zzHI8}oJK`$YUPG``%AxBII$)f=3=7+|419=t+yq!>ekPzcFn4~*CU@pT$Rk0ZUZD) zla}DGa+g2z$M@foEYmc-4&Ic?hQnWG;d5pVAUuMpujeCV_J-}JNQy)EkPq}QZosQ*F#gzNvI zK(0$GTGL#r*QzczmiVT$QnEdpzbApS%z9NTt3jNp%bK(N@||~naQ^7@tH1xto$1$} zJ@EJE+TvBoS=PWI&c&-i=v}&j0jYY|{pTM1^LI}(z0f4T?1T~+(zdFlmoS!YNRDll z%PM-sS+13h4>vb|F1hDz^YuBn+3^5>{nT}-RVu5x2GOTl)uq~{Ur9_ZsEd~CU-aJg z+CQ&|5B=fU$vdw*xBoPI_L1LRlgefICZ-AR3*LSsKOtq`@duQD?PVBn+iW-{_mfoE z;zsZ>o#dJ=mqg;5NHD%(+)9S!XikMl)tYNcr@s2kaTrGWIq#fEj$3!NMUdw!D3jPu zKqrGVS@ys4dprUXs1aPobKG|Y7Xs-wywEX$cPG*T@>}E*Z+F3ese_{TSnwNd`xxAa zUcwvxqF+3lc %{{[0-9]*}}, i32 0, i32 0, i32 0) @@ -118,18 +118,18 @@ void main () ; SHADERTEST: call i64 @llvm.amdgcn.raw.buffer.atomic.or.i64{{(\.v4i32)?}}(i64 %{{[0-9]*}}, <4 x i32> %{{[0-9]*}}, i32 64, i32 0, i32 0) ; SHADERTEST: call i64 @llvm.amdgcn.raw.buffer.atomic.xor.i64{{(\.v4i32)?}}(i64 %{{[0-9]*}}, <4 x i32> %{{[0-9]*}}, i32 72, i32 0, i32 0) ; SHADERTEST: call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64{{(\.v4i32)?}}(i64 %{{[0-9]*}}, <4 x i32> %{{[0-9]*}}, i32 80, i32 0, i32 0) -; SHADERTEST: atomicrmw umin ptr addrspace(3) @{{.*}} i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw umax ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 8), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw and ptr addrspace(3) getelementptr inbounds (i8, ptr 
addrspace(3) @{{.*}}, i32 16), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw or ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 24), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw xor ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 32), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw min ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 40), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw max ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 48), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw and ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 56), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw or ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 64), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw xor ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 72), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw add ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 80), i64 %{{[0-9]*}} monotonic -; SHADERTEST: cmpxchg ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 96), i64 78187493520, i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw umin ptr addrspace(3) @{{.*}} i64 %{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: atomicrmw umax ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 8), i64 %{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: atomicrmw and ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 16), i64 %{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: atomicrmw or ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 24), i64 %{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: atomicrmw xor ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 32), i64 
%{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: atomicrmw min ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 40), i64 %{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: atomicrmw max ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 48), i64 %{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: atomicrmw and ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 56), i64 %{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: atomicrmw or ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 64), i64 %{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: atomicrmw xor ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 72), i64 %{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: atomicrmw add ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 80), i64 %{{[0-9]*}} syncscope("agent") monotonic +; SHADERTEST: cmpxchg ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 96), i64 78187493520, i64 %{{[0-9]*}} syncscope("agent") monotonic ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpFNegate_TestDvec3_lit.frag b/llpc/test/shaderdb/core/OpFNegate_TestDvec3_lit.frag index 02bc666dd0..6c3c669ecf 100644 --- a/llpc/test/shaderdb/core/OpFNegate_TestDvec3_lit.frag +++ b/llpc/test/shaderdb/core/OpFNegate_TestDvec3_lit.frag @@ -19,7 +19,7 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: {{fsub|fneg}} reassoc nnan nsz arcp contract <3 x double> {{(, )?}}% ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST-COUNT-3: {{fsub|fneg}} reassoc nnan nsz arcp contract float +; SHADERTEST-COUNT-3: {{fsub|fneg}} {{(reassoc nnan nsz arcp contract )?}}float ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git 
a/llpc/test/shaderdb/core/OpImageRead_TestInt64ImageLoad.spvasm b/llpc/test/shaderdb/core/OpImageRead_TestInt64ImageLoad.spvasm index 9e532f1912..68ed3ebcbd 100644 --- a/llpc/test/shaderdb/core/OpImageRead_TestInt64ImageLoad.spvasm +++ b/llpc/test/shaderdb/core/OpImageRead_TestInt64ImageLoad.spvasm @@ -3,7 +3,7 @@ ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call <4 x i64> (...) @lgc.create.image.load.v4i64 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call <2 x i32> @llvm.amdgcn.image.load.2d.v2i32.i16{{(\.v8i32)?}}(i32 3, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !10 +; SHADERTEST: call <2 x i32> @llvm.amdgcn.image.load.2d.v2i32.i16{{(\.v8i32)?}}(i32 3, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}} ; SHADERTEST: AMDLLPC SUCCESS ; SPIR-V diff --git a/llpc/test/shaderdb/core/OpImageSparseRead_TestInt64SparseImageLoad.spvasm b/llpc/test/shaderdb/core/OpImageSparseRead_TestInt64SparseImageLoad.spvasm index f500946c55..288794b30e 100644 --- a/llpc/test/shaderdb/core/OpImageSparseRead_TestInt64SparseImageLoad.spvasm +++ b/llpc/test/shaderdb/core/OpImageSparseRead_TestInt64SparseImageLoad.spvasm @@ -3,7 +3,7 @@ ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call { <4 x i64>, i32 } (...) 
@"lgc.create.image.load.s[v4i64,i32]" ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call { <2 x i32>, i32 } @llvm.amdgcn.image.load.2d.sl_v2i32i32s.i16{{(\.v8i32)?}}(i32 3, i16 3, i16 3, <8 x i32> %{{.*}}, i32 1, i32 0), !invariant.load !10 +; SHADERTEST: call { <2 x i32>, i32 } @llvm.amdgcn.image.load.2d.sl_v2i32i32s.i16{{(\.v8i32)?}}(i32 3, i16 3, i16 3, <8 x i32> %{{.*}}, i32 1, i32 0), !invariant.load !{{.*}} ; SHADERTEST: AMDLLPC SUCCESS ; SPIR-V diff --git a/llpc/test/shaderdb/core/TestXfbStateMetadata.vert b/llpc/test/shaderdb/core/TestXfbStateMetadata.vert index 26a9197000..36cd971cb4 100644 --- a/llpc/test/shaderdb/core/TestXfbStateMetadata.vert +++ b/llpc/test/shaderdb/core/TestXfbStateMetadata.vert @@ -18,7 +18,7 @@ void main() gl_PointSize = pointSize; } // CHECK-LABEL: define {{[^@]+}}@lgc.shader.VS.main -// CHECK-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !10 !lgc.shaderstage !1 !lgc.xfb.state !11 { +// CHECK-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !11 !lgc.shaderstage !1 !lgc.xfb.state !12 { // CHECK-NEXT: .entry: // CHECK-NEXT: [[TMP0:%.*]] = call float @lgc.load.vertex.input__f32(i1 false, i32 1, i32 0, i32 0, i32 poison, i32 poison, i32 poison) // CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @lgc.load.vertex.input__v4f32(i1 false, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison) diff --git a/llpc/test/shaderdb/debug_info/PipelineGsTess_TestVsTesGsMergeShader.pipe b/llpc/test/shaderdb/debug_info/PipelineGsTess_TestVsTesGsMergeShader.pipe index 0fd9d59bef..b540ae4b1d 100644 --- a/llpc/test/shaderdb/debug_info/PipelineGsTess_TestVsTesGsMergeShader.pipe +++ b/llpc/test/shaderdb/debug_info/PipelineGsTess_TestVsTesGsMergeShader.pipe @@ -648,9 +648,10 @@ attribute[2].location = 2 attribute[2].binding = 0 attribute[2].format = VK_FORMAT_R32G32_SFLOAT attribute[2].offset = 0 + ;. 
; SHADERTEST: @[[LDS_GS:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(3) global [1250 x i32], align 4 -; SHADERTEST: @[[LDS_HS:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(3) global [1152 x i32], align 4 +; SHADERTEST: @[[LDS_HS:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(3) global [2240 x i32], align 4 ;. ; SHADERTEST-LABEL: @_amdgpu_hs_main( ; SHADERTEST-NEXT: .entry: @@ -666,7 +667,7 @@ attribute[2].offset = 0 ; SHADERTEST-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP5]], i64 1 ; SHADERTEST-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP5]], i64 2 ; SHADERTEST-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP5]], i64 3 -; SHADERTEST-NEXT: call amdgpu_ls void @_amdgpu_ls_main(i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP15]], i32 [[TMP16]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 poison, i32 [[TMP11:%.*]]), !dbg [[DBG100:![0-9]+]] +; SHADERTEST-NEXT: call amdgpu_ls void @_amdgpu_ls_main(i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP15]], i32 [[TMP16]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 poison, i32 [[TMP11:%.*]]), !dbg [[DBG101:![0-9]+]] ; SHADERTEST-NEXT: br label [[DOTENDLS]] ; SHADERTEST: .endLs: ; SHADERTEST-NEXT: fence syncscope("workgroup") release @@ -679,7 +680,7 @@ attribute[2].offset = 0 ; SHADERTEST-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP5]], i64 1 ; SHADERTEST-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP5]], i64 2 ; SHADERTEST-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP5]], i64 3 -; SHADERTEST-NEXT: call amdgpu_hs void @_amdgpu_hs_main.1(i32 [[TMP17]], i32 [[TMP18]], i32 [[TMP19]], i32 [[TMP20]], i32 [[OFFCHIPLDSBASE:%.*]], i32 [[TFBUFFERBASE:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]]), !dbg [[DBG100]] +; SHADERTEST-NEXT: call amdgpu_hs void @_amdgpu_hs_main.1(i32 [[TMP17]], i32 [[TMP18]], i32 [[TMP19]], i32 [[TMP20]], i32 [[OFFCHIPLDSBASE:%.*]], i32 [[TFBUFFERBASE:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]]), !dbg [[DBG101]] ; SHADERTEST-NEXT: br label [[DOTENDHS]] ; SHADERTEST: .endHs: ; SHADERTEST-NEXT: ret void @@ 
-699,7 +700,7 @@ attribute[2].offset = 0 ; SHADERTEST-NEXT: br i1 [[VALIDESVERT]], label [[DOTBEGINES:%.*]], label [[DOTENDES:%.*]] ; SHADERTEST: .beginEs: ; SHADERTEST-NEXT: [[TMP17:%.*]] = extractelement <1 x i32> [[TMP5:%.*]], i64 0 -; SHADERTEST-NEXT: call amdgpu_es void @_amdgpu_es_main(i32 [[TMP17]], i32 [[OFFCHIPLDSBASE:%.*]], i32 [[TMP16]], float [[TMP11:%.*]], float [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]]), !dbg [[DBG103:![0-9]+]] +; SHADERTEST-NEXT: call amdgpu_es void @_amdgpu_es_main(i32 [[TMP17]], i32 [[OFFCHIPLDSBASE:%.*]], i32 [[TMP16]], float [[TMP11:%.*]], float [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]]), !dbg [[DBG104:![0-9]+]] ; SHADERTEST-NEXT: br label [[DOTENDES]] ; SHADERTEST: .endEs: ; SHADERTEST-NEXT: fence syncscope("workgroup") release @@ -715,7 +716,7 @@ attribute[2].offset = 0 ; SHADERTEST-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 poison, i32 0, i32 16) ; SHADERTEST-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 poison, i32 16, i32 16) ; SHADERTEST-NEXT: [[TMP24:%.*]] = extractelement <1 x i32> [[TMP5]], i64 0 -; SHADERTEST-NEXT: call amdgpu_gs void @_amdgpu_gs_main.2(i32 [[TMP24]], i32 [[GSVSOFFSET:%.*]], i32 [[GSWAVEID]], i32 [[TMP18]], i32 [[TMP19]], i32 [[TMP8:%.*]], i32 [[TMP20]], i32 [[TMP21]], i32 [[TMP22]], i32 [[TMP23]], i32 [[TMP9:%.*]]), !dbg [[DBG103]] +; SHADERTEST-NEXT: call amdgpu_gs void @_amdgpu_gs_main.2(i32 [[TMP24]], i32 [[GSVSOFFSET:%.*]], i32 [[GSWAVEID]], i32 [[TMP18]], i32 [[TMP19]], i32 [[TMP8:%.*]], i32 [[TMP20]], i32 [[TMP21]], i32 [[TMP22]], i32 [[TMP23]], i32 [[TMP9:%.*]]), !dbg [[DBG104]] ; SHADERTEST-NEXT: br label [[DOTENDGS]] ; SHADERTEST: .endGs: ; SHADERTEST-NEXT: ret void @@ -723,83 +724,107 @@ attribute[2].offset = 0 ; ; SHADERTEST-LABEL: @_amdgpu_ls_main( ; SHADERTEST-NEXT: .entry: -; SHADERTEST-NEXT: ret void, !dbg [[DBG108:![0-9]+]] +; SHADERTEST-NEXT: ret void, !dbg [[DBG109:![0-9]+]] ; ; ; SHADERTEST-LABEL: @_amdgpu_hs_main.1( ; SHADERTEST-NEXT: 
.entry: -; SHADERTEST-NEXT: [[TMP0:%.*]] = and i32 [[RELPATCHID:%.*]], 255 -; SHADERTEST-NEXT: [[TMP1:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR7:[0-9]+]] -; SHADERTEST-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> -; SHADERTEST-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[GLOBALTABLE:%.*]], i32 0 -; SHADERTEST-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 -; SHADERTEST-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(4) -; SHADERTEST-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 144 -; SHADERTEST-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP6]], align 16 -; SHADERTEST-NEXT: [[TMP8:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR7]] -; SHADERTEST-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], -4294967296 -; SHADERTEST-NEXT: [[TMP10:%.*]] = zext i32 [[GLOBALTABLE]] to i64 -; SHADERTEST-NEXT: [[TMP11:%.*]] = or disjoint i64 [[TMP9]], [[TMP10]] -; SHADERTEST-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr addrspace(4) -; SHADERTEST-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP12]], i64 160 -; SHADERTEST-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP13]], align 16 -; SHADERTEST-NEXT: [[TMP15:%.*]] = and i32 [[RELPATCHID]], 255 -; SHADERTEST-NEXT: [[TMP16:%.*]] = lshr i32 [[RELPATCHID]], 8 -; SHADERTEST-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 31 -; SHADERTEST-NEXT: [[DOTIDX:%.*]] = mul nuw nsw i32 [[TMP15]], 24, !dbg [[DBG111:![0-9]+]] -; SHADERTEST-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 [[DOTIDX]], !dbg [[DBG111]] -; SHADERTEST-NEXT: store i32 1073741824, ptr addrspace(3) [[TMP18]], align 4, !dbg [[DBG111]] -; SHADERTEST-NEXT: [[TMP19:%.*]] = mul nuw nsw i32 [[TMP15]], 6, !dbg [[DBG112:![0-9]+]] -; SHADERTEST-NEXT: [[TMP20:%.*]] = or disjoint i32 [[TMP19]], 1, !dbg [[DBG112]] -; SHADERTEST-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP20]], !dbg [[DBG112]] -; SHADERTEST-NEXT: 
store i32 1073741824, ptr addrspace(3) [[TMP21]], align 4, !dbg [[DBG112]] -; SHADERTEST-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 8, !dbg [[DBG113:![0-9]+]] -; SHADERTEST-NEXT: store i32 1073741824, ptr addrspace(3) [[TMP22]], align 4, !dbg [[DBG113]] -; SHADERTEST-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 16, !dbg [[DBG114:![0-9]+]] -; SHADERTEST-NEXT: store i32 1082130432, ptr addrspace(3) [[TMP23]], align 4, !dbg [[DBG114]] -; SHADERTEST-NEXT: [[TMP24:%.*]] = mul nuw nsw i32 [[TMP15]], 3, !dbg [[DBG115:![0-9]+]] -; SHADERTEST-NEXT: [[TMP25:%.*]] = add nuw nsw i32 [[TMP24]], [[TMP17]], !dbg [[DBG115]] -; SHADERTEST-NEXT: [[DOTIDX3:%.*]] = shl nuw nsw i32 [[TMP25]], 4, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 [[DOTIDX3]], !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP26]], i32 1536, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(3) [[TMP27]], align 4, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP26]], i32 1540, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(3) [[TMP29]], align 4, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP26]], i32 1544, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(3) [[TMP31]], align 4, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP26]], i32 1548, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(3) [[TMP33]], align 4, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP35:%.*]] = shl nuw nsw i32 [[TMP17]], 4, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP36:%.*]] = mul nuw nsw i32 [[TMP15]], 48, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP37:%.*]] = add nuw nsw i32 [[TMP35]], [[TMP36]], !dbg [[DBG115]] -; SHADERTEST-NEXT: [[DOTUPTO010:%.*]] = 
insertelement <4 x i32> poison, i32 [[TMP28]], i64 0, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[DOTUPTO111:%.*]] = insertelement <4 x i32> [[DOTUPTO010]], i32 [[TMP30]], i64 1, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[DOTUPTO212:%.*]] = insertelement <4 x i32> [[DOTUPTO111]], i32 [[TMP32]], i64 2, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> [[DOTUPTO212]], i32 [[TMP34]], i64 3, !dbg [[DBG115]] -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.v4i32{{(\.v4i32)?}}(<4 x i32> [[TMP38]], <4 x i32> [[TMP14]], i32 [[TMP37]], i32 [[OFFCHIPLDSBASE:%.*]], i32 immarg 77, i32 immarg 1) #[[ATTR12:[0-9]+]], !dbg [[DBG115]] -; SHADERTEST-NEXT: fence syncscope("workgroup") release, !dbg [[DBG115]] -; SHADERTEST-NEXT: call void @llvm.amdgcn.s.barrier(), !dbg [[DBG115]] -; SHADERTEST-NEXT: fence syncscope("workgroup") acquire, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP39:%.*]] = mul i32 [[TMP0]], 6, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP39]], !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP41:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP40]], align 4, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP42:%.*]] = mul i32 [[TMP0]], 6, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], 4, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP43]], !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP45:%.*]] = load <1 x float>, ptr addrspace(3) [[TMP44]], align 4, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP46:%.*]] = mul i32 [[TMP0]], 16, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP47:%.*]] = shufflevector <3 x float> [[TMP41]], <3 x float> poison, <4 x i32> , !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP48:%.*]] = extractelement <1 x float> [[TMP45]], i64 0, !dbg [[DBG115]] -; SHADERTEST-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP48]], i64 3, !dbg [[DBG115]] -; SHADERTEST-NEXT: call void 
@llvm.amdgcn.raw.tbuffer.store.v4f32{{(\.v4i32)?}}(<4 x float> [[TMP49]], <4 x i32> [[TMP7]], i32 [[TMP46]], i32 [[TFBUFFERBASE:%.*]], i32 77, i32 1), !dbg [[DBG115]] -; SHADERTEST-NEXT: ret void, !dbg [[DBG115]] +; SHADERTEST-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[RELPATCHID:%.*]], i32 8, i32 5) #[[ATTR8:[0-9]+]] +; SHADERTEST-NEXT: [[TMP1:%.*]] = and i32 [[RELPATCHID]], 255 +; SHADERTEST-NEXT: [[TMP2:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR8]] +; SHADERTEST-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; SHADERTEST-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[GLOBALTABLE:%.*]], i32 0 +; SHADERTEST-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; SHADERTEST-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4) +; SHADERTEST-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 160 +; SHADERTEST-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP7]], align 16 +; SHADERTEST-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 144 +; SHADERTEST-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP9]], align 16 +; SHADERTEST-NEXT: [[TMP11:%.*]] = and i32 [[RELPATCHID]], 255 +; SHADERTEST-NEXT: [[TMP12:%.*]] = lshr i32 [[RELPATCHID]], 8 +; SHADERTEST-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 31 +; SHADERTEST-NEXT: [[DOTIDX:%.*]] = mul nuw nsw i32 [[TMP11]], 20, !dbg [[DBG112:![0-9]+]] +; SHADERTEST-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 [[DOTIDX]], !dbg [[DBG112]] +; SHADERTEST-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP14]], i32 3840, !dbg [[DBG112]] +; SHADERTEST-NEXT: store i32 1073741824, ptr addrspace(3) [[TMP15]], align 4, !dbg [[DBG112]] +; SHADERTEST-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP14]], i32 3844, !dbg [[DBG113:![0-9]+]] +; SHADERTEST-NEXT: store i32 1073741824, ptr addrspace(3) [[TMP16]], align 4, !dbg [[DBG113]] +; SHADERTEST-NEXT: [[TMP17:%.*]] = 
getelementptr i8, ptr addrspace(3) [[TMP14]], i32 3848, !dbg [[DBG114:![0-9]+]] +; SHADERTEST-NEXT: store i32 1073741824, ptr addrspace(3) [[TMP17]], align 4, !dbg [[DBG114]] +; SHADERTEST-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP14]], i32 3852, !dbg [[DBG115:![0-9]+]] +; SHADERTEST-NEXT: store i32 1082130432, ptr addrspace(3) [[TMP18]], align 4, !dbg [[DBG115]] +; SHADERTEST-NEXT: [[TMP19:%.*]] = mul nuw nsw i32 [[TMP11]], 3, !dbg [[DBG116:![0-9]+]] +; SHADERTEST-NEXT: [[TMP20:%.*]] = add nuw nsw i32 [[TMP19]], [[TMP13]], !dbg [[DBG116]] +; SHADERTEST-NEXT: [[DOTIDX4:%.*]] = mul nuw nsw i32 [[TMP20]], 20, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 [[DOTIDX4]], !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP21]], i32 5120, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(3) [[TMP22]], align 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP21]], i32 5124, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(3) [[TMP24]], align 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP21]], i32 5128, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(3) [[TMP26]], align 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP21]], i32 5132, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(3) [[TMP28]], align 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[DOTIDX5:%.*]] = mul nuw nsw i32 [[TMP11]], 60, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 [[DOTIDX5]], !dbg [[DBG116]] +; SHADERTEST-NEXT: [[DOTIDX6:%.*]] = mul nuw nsw i32 [[TMP13]], 20, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP30]], i32 [[DOTIDX6]], !dbg [[DBG116]] +; 
SHADERTEST-NEXT: store i32 [[TMP23]], ptr addrspace(3) [[TMP31]], align 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP31]], i32 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: store i32 [[TMP25]], ptr addrspace(3) [[TMP32]], align 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP31]], i32 8, !dbg [[DBG116]] +; SHADERTEST-NEXT: store i32 [[TMP27]], ptr addrspace(3) [[TMP33]], align 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP31]], i32 12, !dbg [[DBG116]] +; SHADERTEST-NEXT: store i32 [[TMP29]], ptr addrspace(3) [[TMP34]], align 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: fence syncscope("workgroup") release, !dbg [[DBG116]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.s.barrier(), !dbg [[DBG116]] +; SHADERTEST-NEXT: fence syncscope("workgroup") acquire, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP35:%.*]] = mul i32 [[TMP1]], 5, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP36:%.*]] = add i32 [[TMP35]], 960, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP37:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP36]], !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP38:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP37]], align 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP39:%.*]] = mul i32 [[TMP1]], 5, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP40:%.*]] = add i32 [[TMP39]], 960, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP41:%.*]] = add i32 [[TMP40]], 3, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP41]], !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP43:%.*]] = load <1 x float>, ptr addrspace(3) [[TMP42]], align 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP44:%.*]] = mul i32 [[TMP1]], 16, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP45:%.*]] = shufflevector <3 x float> [[TMP38]], <3 x float> poison, <4 x i32> , !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP46:%.*]] = extractelement <1 x float> [[TMP43]], i64 0, !dbg 
[[DBG116]] +; SHADERTEST-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP46]], i64 3, !dbg [[DBG116]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.buffer.store.v4f32{{(.v4i32)?}}(<4 x float> [[TMP47]], <4 x i32> [[TMP10]], i32 [[TMP44]], i32 [[TFBUFFERBASE:%.*]], i32 1), !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP48:%.*]] = extractelement <3 x float> [[TMP38]], i64 0, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP49:%.*]] = extractelement <3 x float> [[TMP38]], i64 1, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP50:%.*]] = call float @llvm.minnum.f32(float [[TMP48]], float [[TMP49]]), !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP51:%.*]] = extractelement <3 x float> [[TMP38]], i64 2, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP52:%.*]] = call float @llvm.minnum.f32(float [[TMP50]], float [[TMP51]]), !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP53:%.*]] = fcmp ogt float [[TMP52]], 0.000000e+00, !dbg [[DBG116]] +; SHADERTEST-NEXT: br i1 [[TMP53]], label [[DOTWRITEHSOUTPUTS_THEN:%.*]], label [[DOTWRITEHSOUTPUTS_ENDIF:%.*]], !dbg [[DBG116]] +; SHADERTEST: .writeHsOutputs.then: +; SHADERTEST-NEXT: [[TMP54:%.*]] = mul i32 [[TMP1]], 15, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP55:%.*]] = mul i32 [[TMP0]], 5, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP56:%.*]] = add i32 [[TMP54]], [[TMP55]], !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP57:%.*]] = add i32 [[TMP56]], 0, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP58:%.*]] = mul i32 [[TMP1]], 12, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP59:%.*]] = mul i32 [[TMP0]], 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP60:%.*]] = add i32 [[TMP58]], [[TMP59]], !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP61:%.*]] = add i32 [[TMP60]], 0, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP62:%.*]] = add i32 [[TMP57]], 0, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP62]], !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP64:%.*]] = load <4 x float>, ptr addrspace(3) [[TMP63]], align 4, !dbg [[DBG116]] +; 
SHADERTEST-NEXT: [[TMP65:%.*]] = add i32 [[TMP61]], 0, !dbg [[DBG116]] +; SHADERTEST-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], 4, !dbg [[DBG116]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.v4f32{{(.v4i32)?}}(<4 x float> [[TMP64]], <4 x i32> [[TMP8]], i32 [[TMP66]], i32 [[OFFCHIPLDSBASE:%.*]], i32 77, i32 1), !dbg [[DBG116]] +; SHADERTEST-NEXT: br label [[DOTWRITEHSOUTPUTS_ENDIF]], !dbg [[DBG116]] +; SHADERTEST: .writeHsOutputs.endif: +; SHADERTEST-NEXT: ret void, !dbg [[DBG116]] ; ; ; SHADERTEST-LABEL: @_amdgpu_es_main( ; SHADERTEST-NEXT: .entry: ; SHADERTEST-NEXT: [[TMP0:%.*]] = fadd float [[TESSCOORDX:%.*]], [[TESSCOORDY:%.*]] -; SHADERTEST-NEXT: [[TMP1:%.*]] = fsub float 1.000000e+00, [[TMP0]], !dbg [[DBG121:![0-9]+]] -; SHADERTEST-NEXT: [[TMP2:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR7]] +; SHADERTEST-NEXT: [[TMP1:%.*]] = fsub float 1.000000e+00, [[TMP0]], !dbg [[DBG122:![0-9]+]] +; SHADERTEST-NEXT: [[TMP2:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR8]] ; SHADERTEST-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -4294967296 ; SHADERTEST-NEXT: [[TMP4:%.*]] = zext i32 [[GLOBALTABLE:%.*]] to i64 ; SHADERTEST-NEXT: [[TMP5:%.*]] = or disjoint i64 [[TMP3]], [[TMP4]] @@ -808,100 +833,100 @@ attribute[2].offset = 0 ; SHADERTEST-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP7]], align 16 ; SHADERTEST-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) ; SHADERTEST-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP9]]) -; SHADERTEST-NEXT: #dbg_value(float [[TESSCOORDX]], !118, !DIExpression(), !122) -; SHADERTEST-NEXT: #dbg_value(float [[TESSCOORDY]], !119, !DIExpression(), !122) -; SHADERTEST-NEXT: #dbg_value(float [[TMP1]], !120, !DIExpression(), !122) -; SHADERTEST-NEXT: [[TMP11:%.*]] = mul i32 [[RELPATCHID:%.*]], 48, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32{{(\.v4i32)?}}(<4 x i32> [[TMP8]], i32 [[TMP11]], i32 
[[OFFCHIPLDSBASE:%.*]], i32 immarg 77, i32 immarg 5) #[[ATTR8:[0-9]+]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI07:%.*]] = extractelement <4 x float> [[BC]], i64 0, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC56:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI18:%.*]] = extractelement <4 x float> [[BC56]], i64 1, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC57:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI29:%.*]] = extractelement <4 x float> [[BC57]], i64 2, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC58:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI310:%.*]] = extractelement <4 x float> [[BC58]], i64 3, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE_I0:%.*]] = fmul nnan nsz afn float [[TESSCOORDX]], [[DOTI07]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE_I1:%.*]] = fmul nnan nsz afn float [[TESSCOORDX]], [[DOTI18]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE_I2:%.*]] = fmul nnan nsz afn float [[TESSCOORDX]], [[DOTI29]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE_I3:%.*]] = fmul nnan nsz afn float [[TESSCOORDX]], [[DOTI310]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], 16, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32{{(\.v4i32)?}}(<4 x i32> [[TMP8]], i32 [[TMP13]], i32 [[OFFCHIPLDSBASE]], i32 immarg 77, i32 immarg 5) #[[ATTR8]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC59:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI012:%.*]] = extractelement <4 x float> [[BC59]], i64 0, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC60:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI114:%.*]] = extractelement <4 x float> [[BC60]], i64 1, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC61:%.*]] = 
bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI216:%.*]] = extractelement <4 x float> [[BC61]], i64 2, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC62:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI318:%.*]] = extractelement <4 x float> [[BC62]], i64 3, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE2_I0:%.*]] = fmul nnan nsz afn float [[TESSCOORDY]], [[DOTI012]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE2_I1:%.*]] = fmul nnan nsz afn float [[TESSCOORDY]], [[DOTI114]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE2_I2:%.*]] = fmul nnan nsz afn float [[TESSCOORDY]], [[DOTI216]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE2_I3:%.*]] = fmul nnan nsz afn float [[TESSCOORDY]], [[DOTI318]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI019:%.*]] = fadd nnan nsz afn float [[SCALE_I0]], [[SCALE2_I0]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI120:%.*]] = fadd nnan nsz afn float [[SCALE_I1]], [[SCALE2_I1]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI221:%.*]] = fadd nnan nsz afn float [[SCALE_I2]], [[SCALE2_I2]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI322:%.*]] = fadd nnan nsz afn float [[SCALE_I3]], [[SCALE2_I3]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[TMP15:%.*]] = add i32 [[TMP11]], 32, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[TMP16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32{{(\.v4i32)?}}(<4 x i32> [[TMP8]], i32 [[TMP15]], i32 [[OFFCHIPLDSBASE]], i32 immarg 77, i32 immarg 5) #[[ATTR8]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC63:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI024:%.*]] = extractelement <4 x float> [[BC63]], i64 0, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC64:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI126:%.*]] = extractelement <4 x float> [[BC64]], i64 1, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC65:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG121]] -; 
SHADERTEST-NEXT: [[DOTI228:%.*]] = extractelement <4 x float> [[BC65]], i64 2, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[BC66:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI330:%.*]] = extractelement <4 x float> [[BC66]], i64 3, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE4_I0:%.*]] = fmul nnan nsz afn float [[TMP1]], [[DOTI024]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE4_I1:%.*]] = fmul nnan nsz afn float [[TMP1]], [[DOTI126]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE4_I2:%.*]] = fmul nnan nsz afn float [[TMP1]], [[DOTI228]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[SCALE4_I3:%.*]] = fmul nnan nsz afn float [[TMP1]], [[DOTI330]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI031:%.*]] = fadd nnan nsz afn float [[DOTI019]], [[SCALE4_I0]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI132:%.*]] = fadd nnan nsz afn float [[DOTI120]], [[SCALE4_I1]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI233:%.*]] = fadd nnan nsz afn float [[DOTI221]], [[SCALE4_I2]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTI334:%.*]] = fadd nnan nsz afn float [[DOTI322]], [[SCALE4_I3]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[DOTIDX:%.*]] = mul i32 [[TMP10]], 20, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(3) @Lds.GS, i32 [[DOTIDX]], !dbg [[DBG121]] -; SHADERTEST-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr addrspace(3) [[TMP17]], i32 [[ESGSOFFSET:%.*]], !dbg [[DBG121]] -; SHADERTEST-NEXT: store float [[DOTI031]], ptr addrspace(3) [[TMP18]], align 4, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 4, !dbg [[DBG121]] -; SHADERTEST-NEXT: store float [[DOTI132]], ptr addrspace(3) [[TMP19]], align 4, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 8, !dbg [[DBG121]] -; SHADERTEST-NEXT: store float [[DOTI233]], ptr addrspace(3) [[TMP20]], align 4, !dbg [[DBG121]] -; SHADERTEST-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr 
addrspace(3) [[TMP18]], i32 12, !dbg [[DBG121]] -; SHADERTEST-NEXT: store float [[DOTI334]], ptr addrspace(3) [[TMP21]], align 4, !dbg [[DBG121]] -; SHADERTEST-NEXT: ret void, !dbg [[DBG121]] +; SHADERTEST-NEXT: #dbg_value(float [[TESSCOORDX]], !119, !DIExpression(), !123) +; SHADERTEST-NEXT: #dbg_value(float [[TESSCOORDY]], !120, !DIExpression(), !123) +; SHADERTEST-NEXT: #dbg_value(float [[TMP1]], !121, !DIExpression(), !123) +; SHADERTEST-NEXT: [[TMP11:%.*]] = mul i32 [[RELPATCHID:%.*]], 48, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> [[TMP8]], i32 [[TMP11]], i32 [[OFFCHIPLDSBASE:%.*]], i32 immarg 77, i32 immarg 5) #[[ATTR9:[0-9]+]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI07:%.*]] = extractelement <4 x float> [[BC]], i64 0, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC56:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI18:%.*]] = extractelement <4 x float> [[BC56]], i64 1, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC57:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI29:%.*]] = extractelement <4 x float> [[BC57]], i64 2, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC58:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI310:%.*]] = extractelement <4 x float> [[BC58]], i64 3, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE_I0:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TESSCOORDX]], [[DOTI07]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE_I1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TESSCOORDX]], [[DOTI18]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE_I2:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TESSCOORDX]], [[DOTI29]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE_I3:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TESSCOORDX]], [[DOTI310]], 
!dbg [[DBG122]] +; SHADERTEST-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], 16, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> [[TMP8]], i32 [[TMP13]], i32 [[OFFCHIPLDSBASE]], i32 immarg 77, i32 immarg 5) #[[ATTR9]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC59:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI012:%.*]] = extractelement <4 x float> [[BC59]], i64 0, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC60:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI114:%.*]] = extractelement <4 x float> [[BC60]], i64 1, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC61:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI216:%.*]] = extractelement <4 x float> [[BC61]], i64 2, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC62:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI318:%.*]] = extractelement <4 x float> [[BC62]], i64 3, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE2_I0:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TESSCOORDY]], [[DOTI012]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE2_I1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TESSCOORDY]], [[DOTI114]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE2_I2:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TESSCOORDY]], [[DOTI216]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE2_I3:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TESSCOORDY]], [[DOTI318]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI019:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[SCALE2_I0]], [[SCALE_I0]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI120:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[SCALE2_I1]], [[SCALE_I1]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI221:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[SCALE2_I2]], [[SCALE_I2]], !dbg [[DBG122]] +; SHADERTEST-NEXT: 
[[DOTI322:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[SCALE2_I3]], [[SCALE_I3]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[TMP15:%.*]] = add i32 [[TMP11]], 32, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[TMP16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> [[TMP8]], i32 [[TMP15]], i32 [[OFFCHIPLDSBASE]], i32 immarg 77, i32 immarg 5) #[[ATTR9]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC63:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI024:%.*]] = extractelement <4 x float> [[BC63]], i64 0, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC64:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI126:%.*]] = extractelement <4 x float> [[BC64]], i64 1, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC65:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI228:%.*]] = extractelement <4 x float> [[BC65]], i64 2, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[BC66:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI330:%.*]] = extractelement <4 x float> [[BC66]], i64 3, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE4_I0:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP1]], [[DOTI024]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE4_I1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP1]], [[DOTI126]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE4_I2:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP1]], [[DOTI228]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[SCALE4_I3:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP1]], [[DOTI330]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI031:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[DOTI019]], [[SCALE4_I0]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI132:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[DOTI120]], [[SCALE4_I1]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI233:%.*]] = fadd reassoc nnan nsz arcp contract afn float 
[[DOTI221]], [[SCALE4_I2]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTI334:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[DOTI322]], [[SCALE4_I3]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[DOTIDX:%.*]] = mul i32 [[TMP10]], 20, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(3) @Lds.GS, i32 [[DOTIDX]], !dbg [[DBG122]] +; SHADERTEST-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr addrspace(3) [[TMP17]], i32 [[ESGSOFFSET:%.*]], !dbg [[DBG122]] +; SHADERTEST-NEXT: store float [[DOTI031]], ptr addrspace(3) [[TMP18]], align 4, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 4, !dbg [[DBG122]] +; SHADERTEST-NEXT: store float [[DOTI132]], ptr addrspace(3) [[TMP19]], align 4, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 8, !dbg [[DBG122]] +; SHADERTEST-NEXT: store float [[DOTI233]], ptr addrspace(3) [[TMP20]], align 4, !dbg [[DBG122]] +; SHADERTEST-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 12, !dbg [[DBG122]] +; SHADERTEST-NEXT: store float [[DOTI334]], ptr addrspace(3) [[TMP21]], align 4, !dbg [[DBG122]] +; SHADERTEST-NEXT: ret void, !dbg [[DBG122]] ; ; ; SHADERTEST-LABEL: @_amdgpu_gs_main.2( ; SHADERTEST-NEXT: .entry: -; SHADERTEST-NEXT: #dbg_value(i32 0, !125, !DIExpression(), !128) -; SHADERTEST-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[ESGSOFFSET2:%.*]], !dbg [[DBG129:![0-9]+]] -; SHADERTEST-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 12, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 8, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 4, !dbg 
[[DBG129]] -; SHADERTEST-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(3) [[TMP0]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[ESGSOFFSET1:%.*]], !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 12, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 8, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(3) [[TMP8]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[ESGSOFFSET0:%.*]], !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP16]], i32 12, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(3) [[TMP17]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP16]], i32 8, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(3) [[TMP19]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP16]], i32 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(3) [[TMP21]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(3) [[TMP16]], align 4, !dbg [[DBG129]] -; SHADERTEST-NEXT: [[TMP24:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR7]] +; SHADERTEST-NEXT: #dbg_value(i32 0, !126, 
!DIExpression(), !129) +; SHADERTEST-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[ESGSOFFSET2:%.*]], !dbg [[DBG130:![0-9]+]] +; SHADERTEST-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 12, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 8, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(3) [[TMP0]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[ESGSOFFSET1:%.*]], !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 12, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 8, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(3) [[TMP8]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[ESGSOFFSET0:%.*]], !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP16]], i32 12, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(3) [[TMP17]], align 4, !dbg 
[[DBG130]] +; SHADERTEST-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP16]], i32 8, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(3) [[TMP19]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP16]], i32 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(3) [[TMP21]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(3) [[TMP16]], align 4, !dbg [[DBG130]] +; SHADERTEST-NEXT: [[TMP24:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR8]] ; SHADERTEST-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], -4294967296 ; SHADERTEST-NEXT: [[TMP26:%.*]] = zext i32 [[GLOBALTABLE:%.*]] to i64 ; SHADERTEST-NEXT: [[TMP27:%.*]] = or disjoint i64 [[TMP25]], [[TMP26]] @@ -918,47 +943,47 @@ attribute[2].offset = 0 ; SHADERTEST-NEXT: [[TMP33:%.*]] = and i32 [[DOTI3]], -491521 ; SHADERTEST-NEXT: [[TMP34:%.*]] = or disjoint i32 [[TMP33]], 131072 ; SHADERTEST-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> [[DOTUPTO210]], i32 [[TMP34]], i64 3 -; SHADERTEST-NEXT: #dbg_value(i32 0, !125, !DIExpression(), !128) -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP23]], <4 x i32> [[TMP35]], i32 0, i32 [[GSVSOFFSET:%.*]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP22]], <4 x i32> [[TMP35]], i32 12, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP20]], <4 x i32> [[TMP35]], i32 24, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP18]], <4 x i32> [[TMP35]], i32 36, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[GSWAVEID:%.*]]), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: 
#dbg_value(i32 1, !125, !DIExpression(), !128) -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP15]], <4 x i32> [[TMP35]], i32 4, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP14]], <4 x i32> [[TMP35]], i32 16, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP12]], <4 x i32> [[TMP35]], i32 28, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP10]], <4 x i32> [[TMP35]], i32 40, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[GSWAVEID]]), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: #dbg_value(i32 2, !125, !DIExpression(), !128) -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP7]], <4 x i32> [[TMP35]], i32 8, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP6]], <4 x i32> [[TMP35]], i32 20, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP4]], <4 x i32> [[TMP35]], i32 32, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(\.v4i32)?}}(i32 [[TMP2]], <4 x i32> [[TMP35]], i32 44, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[GSWAVEID]]), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: #dbg_value(i32 3, !125, !DIExpression(), !128) -; SHADERTEST-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 18, i32 [[GSWAVEID]]), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: fence syncscope("agent") release, !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: call void 
@llvm.amdgcn.s.sendmsg(i32 3, i32 [[GSWAVEID]]), !dbg !{{[0-9]+}} -; SHADERTEST-NEXT: ret void, !dbg !{{[0-9]+}} +; SHADERTEST-NEXT: #dbg_value(i32 0, !126, !DIExpression(), !129) +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP23]], <4 x i32> [[TMP35]], i32 0, i32 [[GSVSOFFSET:%.*]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP22]], <4 x i32> [[TMP35]], i32 12, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP20]], <4 x i32> [[TMP35]], i32 24, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP18]], <4 x i32> [[TMP35]], i32 36, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[GSWAVEID:%.*]]), !dbg [[DBG130]] +; SHADERTEST-NEXT: #dbg_value(i32 1, !126, !DIExpression(), !129) +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP15]], <4 x i32> [[TMP35]], i32 4, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP14]], <4 x i32> [[TMP35]], i32 16, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP12]], <4 x i32> [[TMP35]], i32 28, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP10]], <4 x i32> [[TMP35]], i32 40, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[GSWAVEID]]), !dbg [[DBG130]] +; SHADERTEST-NEXT: #dbg_value(i32 2, !126, !DIExpression(), !129) +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 
[[TMP7]], <4 x i32> [[TMP35]], i32 8, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP6]], <4 x i32> [[TMP35]], i32 20, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP4]], <4 x i32> [[TMP35]], i32 32, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.i32{{(.v4i32)?}}(i32 [[TMP2]], <4 x i32> [[TMP35]], i32 44, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg [[DBG130]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[GSWAVEID]]), !dbg [[DBG130]] +; SHADERTEST-NEXT: #dbg_value(i32 3, !126, !DIExpression(), !129) +; SHADERTEST-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 18, i32 [[GSWAVEID]]), !dbg [[DBG131:![0-9]+]] +; SHADERTEST-NEXT: fence syncscope("agent") release, !dbg [[DBG131]] +; SHADERTEST-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 [[GSWAVEID]]), !dbg [[DBG131]] +; SHADERTEST-NEXT: ret void, !dbg [[DBG131]] ; ; ; SHADERTEST-LABEL: @_amdgpu_vs_main( -; SHADERTEST-NEXT: [[TMP1:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR7]] +; SHADERTEST-NEXT: [[TMP1:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR8]] ; SHADERTEST-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296 ; SHADERTEST-NEXT: [[TMP3:%.*]] = zext i32 [[GLOBALTABLE:%.*]] to i64 ; SHADERTEST-NEXT: [[TMP4:%.*]] = or disjoint i64 [[TMP2]], [[TMP3]] ; SHADERTEST-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(4) ; SHADERTEST-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i64 128 -; SHADERTEST-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP6]], align 16, !invariant.load !98 +; SHADERTEST-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP6]], align 16, !invariant.load !99 ; SHADERTEST-NEXT: [[TMP8:%.*]] = shl i32 [[VERTEXOFFSET:%.*]], 2 -; SHADERTEST-NEXT: [[TMP9:%.*]] = call float 
@llvm.amdgcn.raw.buffer.load.f32{{(\.v4i32)?}}(<4 x i32> [[TMP7]], i32 [[TMP8]], i32 0, i32 3), !invariant.load !98 +; SHADERTEST-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32{{(.v4i32)?}}(<4 x i32> [[TMP7]], i32 [[TMP8]], i32 0, i32 3), !invariant.load !99 ; SHADERTEST-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], 192 -; SHADERTEST-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32{{(\.v4i32)?}}(<4 x i32> [[TMP7]], i32 [[TMP10]], i32 0, i32 3), !invariant.load !98 +; SHADERTEST-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32{{(.v4i32)?}}(<4 x i32> [[TMP7]], i32 [[TMP10]], i32 0, i32 3), !invariant.load !99 ; SHADERTEST-NEXT: [[TMP12:%.*]] = add i32 [[TMP8]], 384 -; SHADERTEST-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32{{(\.v4i32)?}}(<4 x i32> [[TMP7]], i32 [[TMP12]], i32 0, i32 3), !invariant.load !98 +; SHADERTEST-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32{{(.v4i32)?}}(<4 x i32> [[TMP7]], i32 [[TMP12]], i32 0, i32 3), !invariant.load !99 ; SHADERTEST-NEXT: [[TMP14:%.*]] = add i32 [[TMP8]], 576 -; SHADERTEST-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32{{(\.v4i32)?}}(<4 x i32> [[TMP7]], i32 [[TMP14]], i32 0, i32 3), !invariant.load !98 +; SHADERTEST-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32{{(.v4i32)?}}(<4 x i32> [[TMP7]], i32 [[TMP14]], i32 0, i32 3), !invariant.load !99 ; SHADERTEST-NEXT: call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float [[TMP9]], float [[TMP11]], float [[TMP13]], float [[TMP15]], i1 true, i1 false) ; SHADERTEST-NEXT: ret void ; @@ -972,15 +997,16 @@ attribute[2].offset = 0 ; SHADERTEST: attributes #[[ATTR2:[0-9]+]] = { nounwind willreturn memory(none) } ; SHADERTEST: attributes #[[ATTR3:[0-9]+]] = { nounwind } ; SHADERTEST: attributes #[[ATTR4:[0-9]+]] = { nounwind willreturn memory(read) } -; SHADERTEST: attributes #[[ATTR5:[0-9]+]] = { "target-features"=",+wavefrontsize64" } -; SHADERTEST: attributes 
#[[ATTR6:[0-9]+]] = { memory(readwrite) "InitialPSInputAddr"="0" "amdgpu-color-export"="0" "amdgpu-depth-export"="0" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "target-features"=",+wavefrontsize64" } -; SHADERTEST: attributes #[[ATTR7]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; SHADERTEST: attributes #[[ATTR8]] = { nocallback nofree nosync nounwind willreturn memory(read) } -; SHADERTEST: attributes #[[ATTR9:[0-9]+]] = { nounwind memory(none) } -; SHADERTEST: attributes #[[ATTR10:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; SHADERTEST: attributes #[[ATTR11:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } -; SHADERTEST: attributes #[[ATTR12]] = { nocallback nofree nosync nounwind willreturn memory(write) } -; SHADERTEST: attributes #[[ATTR13:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +; SHADERTEST: attributes #[[ATTR5:[0-9]+]] = { nounwind memory(write) } +; SHADERTEST: attributes #[[ATTR6:[0-9]+]] = { "target-features"=",+wavefrontsize64" } +; SHADERTEST: attributes #[[ATTR7:[0-9]+]] = { memory(readwrite) "InitialPSInputAddr"="0" "amdgpu-color-export"="0" "amdgpu-depth-export"="0" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "target-features"=",+wavefrontsize64" } +; SHADERTEST: attributes #[[ATTR8]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; SHADERTEST: attributes #[[ATTR9]] = { nocallback nofree nosync nounwind willreturn memory(read) } +; SHADERTEST: attributes #[[ATTR10:[0-9]+]] = { nounwind memory(none) } +; SHADERTEST: attributes #[[ATTR11:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; SHADERTEST: attributes #[[ATTR12:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +; SHADERTEST: attributes #[[ATTR13:[0-9]+]] = { nocallback 
nofree nosync nounwind willreturn memory(write) } +; SHADERTEST: attributes #[[ATTR14:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } ;. ; SHADERTEST: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "spirv", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !2) ; SHADERTEST: [[META1:![0-9]+]] = !DIFile(filename: "test.vert", directory: ".", source: "// OpModuleProcessed client vulkan100\0A// OpModuleProcessed target-env vulkan1.0\0A// OpModuleProcessed entry-point main\0A#line 1\0A#version 450\0Alayout(location = 0) in vec4 position @@ -1072,47 +1098,48 @@ attribute[2].offset = 0 ; SHADERTEST: [[META87:![0-9]+]] = !{i32 1, i32 0, i32 0, i32 16, i32 11, i32 7} ; SHADERTEST: [[META88:![0-9]+]] = !{i32 2, i32 0, i32 0, i32 16, i32 11, i32 7} ; SHADERTEST: [[META89:![0-9]+]] = !{i32 10} -; SHADERTEST: [[META90:![0-9]+]] = !{!"\82\B0amdpal.pipelines{{.*}}AEamdpal.version\92\03\00"} -; SHADERTEST: [[META91:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} -; SHADERTEST: [[META92:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; SHADERTEST: [[META93:![0-9]+]] = !{i32 0, i32 0, i32 0, i32 0, i32 3, i32 3} -; SHADERTEST: [[META94:![0-9]+]] = !{i32 2, i32 1, i32 1} -; SHADERTEST: [[META95:![0-9]+]] = !{i32 3, i32 4, i32 1, i32 3} -; SHADERTEST: [[META96:![0-9]+]] = distinct !DISubprogram(name: "lgc.shader.LSHS.main", scope: !78, file: !78, type: !97, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !79) -; SHADERTEST: [[META97:![0-9]+]] = !DISubroutineType(types: !98) -; SHADERTEST: [[META98:![0-9]+]] = !{} -; SHADERTEST: [[META99:![0-9]+]] = !{i32 2} -; SHADERTEST: [[DBG100]] = !DILocation(line: 0, scope: !96) -; SHADERTEST: [[META101:![0-9]+]] = distinct !DISubprogram(name: "lgc.shader.ESGS.main", scope: !78, file: !78, type: !97, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !77) -; SHADERTEST: [[META102:![0-9]+]] = !{i32 4} -; SHADERTEST: [[DBG103]] = !DILocation(line: 0, 
scope: !101) -; SHADERTEST: [[META104:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !1, type: !105, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !0, templateParams: !98) -; SHADERTEST: [[META105:![0-9]+]] = !DISubroutineType(flags: DIFlagPublic, types: !106) -; SHADERTEST: [[META106:![0-9]+]] = !{null} -; SHADERTEST: [[META107:![0-9]+]] = !{i32 0} -; SHADERTEST: [[DBG108]] = !DILocation(line: 7, scope: !104) -; SHADERTEST: [[META109:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !12, type: !105, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !11, templateParams: !98) -; SHADERTEST: [[META110:![0-9]+]] = !{i32 1} -; SHADERTEST: [[DBG111]] = !DILocation(line: 6, scope: !109) -; SHADERTEST: [[DBG112]] = !DILocation(line: 7, scope: !109) -; SHADERTEST: [[DBG113]] = !DILocation(line: 8, scope: !109) -; SHADERTEST: [[DBG114]] = !DILocation(line: 9, scope: !109) -; SHADERTEST: [[DBG115]] = !DILocation(line: 11, scope: !109) -; SHADERTEST: [[META116:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !45, type: !105, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !44, templateParams: !98, retainedNodes: !117) -; SHADERTEST: [[META117:![0-9]+]] = !{!118, !119, !120} -; SHADERTEST: [[META118:![0-9]+]] = !DILocalVariable(name: "u", scope: !116, file: !45, line: 6, type: !6) -; SHADERTEST: [[META119:![0-9]+]] = !DILocalVariable(name: "v", scope: !116, file: !45, line: 7, type: !6) -; SHADERTEST: [[META120:![0-9]+]] = !DILocalVariable(name: "w", scope: !116, file: !45, line: 8, type: !6) -; SHADERTEST: [[DBG121]] = !DILocation(line: 9, scope: !116) -; SHADERTEST: [[META122:![0-9]+]] = !DILocation(line: 0, scope: !116) -; SHADERTEST: [[META123:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !62, type: !105, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !61, templateParams: !98, 
retainedNodes: !124) -; SHADERTEST: [[META124:![0-9]+]] = !{!125} -; SHADERTEST: [[META125:![0-9]+]] = !DILocalVariable(name: "i", scope: !123, file: !62, line: 7, type: !126) -; SHADERTEST: [[META126:![0-9]+]] = !DIBasicType(name: "uint", size: 32, encoding: DW_ATE_unsigned) -; SHADERTEST: [[META127:![0-9]+]] = !{i32 3} -; SHADERTEST: [[META128:![0-9]+]] = !DILocation(line: 0, scope: !123) -; SHADERTEST: !{{[0-9]+}} = !DILocation(line: 10, scope: !123) -; SHADERTEST: !{{[0-9]+}} = !DILocation(line: 12, scope: !123) -; SHADERTEST: [[META131:![0-9]+]] = !{i32 8} -; SHADERTEST: [[META132:![0-9]+]] = !{i32 6} +; SHADERTEST: [[META90:![0-9]+]] = !{i32 32, i32 64, i32 64, i32 64, i32 64, i32 32, i32 64, i32 32} +; SHADERTEST: [[META91:![0-9]+]] = !{!"\82\B0amdpal.pipelines{{.*}}AEamdpal.version\92\03\00"} +; SHADERTEST: [[META92:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} +; SHADERTEST: [[META93:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +; SHADERTEST: [[META94:![0-9]+]] = !{i32 0, i32 0, i32 0, i32 0, i32 3, i32 3} +; SHADERTEST: [[META95:![0-9]+]] = !{i32 2, i32 1, i32 1} +; SHADERTEST: [[META96:![0-9]+]] = !{i32 3, i32 4, i32 1, i32 3} +; SHADERTEST: [[META97:![0-9]+]] = distinct !DISubprogram(name: "lgc.shader.LSHS.main", scope: !78, file: !78, type: !98, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !79) +; SHADERTEST: [[META98:![0-9]+]] = !DISubroutineType(types: !99) +; SHADERTEST: [[META99:![0-9]+]] = !{} +; SHADERTEST: [[META100:![0-9]+]] = !{i32 2} +; SHADERTEST: [[DBG101]] = !DILocation(line: 0, scope: !97) +; SHADERTEST: [[META102:![0-9]+]] = distinct !DISubprogram(name: "lgc.shader.ESGS.main", scope: !78, file: !78, type: !98, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !77) +; SHADERTEST: [[META103:![0-9]+]] = !{i32 4} +; SHADERTEST: [[DBG104]] = !DILocation(line: 0, scope: !102) +; SHADERTEST: [[META105:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !1, type: !106, flags: 
DIFlagPublic, spFlags: DISPFlagDefinition, unit: !0, templateParams: !99) +; SHADERTEST: [[META106:![0-9]+]] = !DISubroutineType(flags: DIFlagPublic, types: !107) +; SHADERTEST: [[META107:![0-9]+]] = !{null} +; SHADERTEST: [[META108:![0-9]+]] = !{i32 0} +; SHADERTEST: [[DBG109]] = !DILocation(line: 7, scope: !105) +; SHADERTEST: [[META110:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !12, type: !106, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !11, templateParams: !99) +; SHADERTEST: [[META111:![0-9]+]] = !{i32 1} +; SHADERTEST: [[DBG112]] = !DILocation(line: 6, scope: !110) +; SHADERTEST: [[DBG113]] = !DILocation(line: 7, scope: !110) +; SHADERTEST: [[DBG114]] = !DILocation(line: 8, scope: !110) +; SHADERTEST: [[DBG115]] = !DILocation(line: 9, scope: !110) +; SHADERTEST: [[DBG116]] = !DILocation(line: 11, scope: !110) +; SHADERTEST: [[META117:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !45, type: !106, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !44, templateParams: !99, retainedNodes: !118) +; SHADERTEST: [[META118:![0-9]+]] = !{!119, !120, !121} +; SHADERTEST: [[META119:![0-9]+]] = !DILocalVariable(name: "u", scope: !117, file: !45, line: 6, type: !6) +; SHADERTEST: [[META120:![0-9]+]] = !DILocalVariable(name: "v", scope: !117, file: !45, line: 7, type: !6) +; SHADERTEST: [[META121:![0-9]+]] = !DILocalVariable(name: "w", scope: !117, file: !45, line: 8, type: !6) +; SHADERTEST: [[DBG122]] = !DILocation(line: 9, scope: !117) +; SHADERTEST: [[META123:![0-9]+]] = !DILocation(line: 0, scope: !117) +; SHADERTEST: [[META124:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !62, type: !106, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !61, templateParams: !99, retainedNodes: !125) +; SHADERTEST: [[META125:![0-9]+]] = !{!126} +; SHADERTEST: [[META126:![0-9]+]] = !DILocalVariable(name: "i", scope: !124, file: 
!62, line: 7, type: !127) +; SHADERTEST: [[META127:![0-9]+]] = !DIBasicType(name: "uint", size: 32, encoding: DW_ATE_unsigned) +; SHADERTEST: [[META128:![0-9]+]] = !{i32 3} +; SHADERTEST: [[META129:![0-9]+]] = !DILocation(line: 0, scope: !124) +; SHADERTEST: [[DBG130]] = !DILocation(line: 10, scope: !124) +; SHADERTEST: [[DBG131]] = !DILocation(line: 12, scope: !124) +; SHADERTEST: [[META132:![0-9]+]] = !{i32 8} +; SHADERTEST: [[META133:![0-9]+]] = !{i32 6} ;. diff --git a/llpc/test/shaderdb/debug_info/PipelineGs_TestVsGSMergeShader.pipe b/llpc/test/shaderdb/debug_info/PipelineGs_TestVsGSMergeShader.pipe index 308d2fec90..05f228a001 100644 --- a/llpc/test/shaderdb/debug_info/PipelineGs_TestVsGSMergeShader.pipe +++ b/llpc/test/shaderdb/debug_info/PipelineGs_TestVsGSMergeShader.pipe @@ -342,7 +342,7 @@ attribute[0].offset = 0 ; SHADERTEST-LABEL: define dllexport amdgpu_vs void @_amdgpu_vs_main( ; SHADERTEST-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[VERTEXOFFSET:%.*]] ; SHADERTEST-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP1]], i32 1280 -; SHADERTEST-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr addrspace(3) [[TMP2]], align 4, !invariant.load !49 +; SHADERTEST-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr addrspace(3) [[TMP2]], align 4, !invariant.load !50 ; SHADERTEST-NEXT: [[DOTI3:%.*]] = extractelement <4 x float> [[TMP3]], i64 3 ; SHADERTEST-NEXT: [[DOTI2:%.*]] = extractelement <4 x float> [[TMP3]], i64 2 ; SHADERTEST-NEXT: [[DOTI1:%.*]] = extractelement <4 x float> [[TMP3]], i64 1 @@ -416,20 +416,20 @@ attribute[0].offset = 0 ; SHADERTEST: [[META44:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; SHADERTEST: [[META45:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ; SHADERTEST: [[META46:![0-9]+]] = !{i32 0, i32 0, i32 1, i32 1} -; SHADERTEST: [[META47:![0-9]+]] = distinct !DISubprogram(name: "lgc.shader.ESGS.main", scope: !34, file: !34, type: !48, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !33) -; 
SHADERTEST: [[META48:![0-9]+]] = !DISubroutineType(types: !49) +; SHADERTEST: [[META47:![0-9]+]] = distinct !DISubprogram(name: "lgc.shader.ESGS.main", scope: !34, file: !34, type: !49, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !33) +; SHADERTEST: [[META48:![0-9]+]] = !DISubroutineType(types: !50) ; SHADERTEST: [[META49:![0-9]+]] = !{} ; SHADERTEST: [[META50:![0-9]+]] = !{i32 4} -; SHADERTEST: [[DBG51]] = !DILocation(line: 0, scope: !47) -; SHADERTEST: [[META52:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !1, type: !53, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !0, templateParams: !49) -; SHADERTEST: [[META53:![0-9]+]] = !DISubroutineType(flags: DIFlagPublic, types: !54) +; SHADERTEST: [[DBG51]] = !DILocation(line: 0, scope: !48) +; SHADERTEST: [[META52:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !1, type: !54, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !0, templateParams: !50) +; SHADERTEST: [[META53:![0-9]+]] = !DISubroutineType(flags: DIFlagPublic, types: !55) ; SHADERTEST: [[META54:![0-9]+]] = !{null} ; SHADERTEST: [[META55:![0-9]+]] = !{i32 0} -; SHADERTEST: [[DBG56]] = !DILocation(line: 7, scope: !52) -; SHADERTEST: [[META57:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !12, type: !53, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !11, templateParams: !49) +; SHADERTEST: [[DBG56]] = !DILocation(line: 7, scope: !53) +; SHADERTEST: [[META57:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !12, type: !54, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !11, templateParams: !50) ; SHADERTEST: [[META58:![0-9]+]] = !{i32 3} -; SHADERTEST: [[DBG59]] = !DILocation(line: 9, scope: !57) -; SHADERTEST: [[DBG60]] = !DILocation(line: 10, scope: !57) +; SHADERTEST: [[DBG59]] = !DILocation(line: 9, scope: !58) +; SHADERTEST: [[DBG60]] = 
!DILocation(line: 10, scope: !58) ; SHADERTEST: [[META61:![0-9]+]] = !{i32 8} ; SHADERTEST: [[META62:![0-9]+]] = !{i32 6} ;. diff --git a/llpc/test/shaderdb/extensions/ExtShaderFloat16_TestInterpFuncs_lit.frag b/llpc/test/shaderdb/extensions/ExtShaderFloat16_TestInterpFuncs_lit.frag index 6e1a81380b..a0f8bfaa3f 100644 --- a/llpc/test/shaderdb/extensions/ExtShaderFloat16_TestInterpFuncs_lit.frag +++ b/llpc/test/shaderdb/extensions/ExtShaderFloat16_TestInterpFuncs_lit.frag @@ -14,17 +14,15 @@ void main() fragColor = f16v2; } + // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: %{{[A-Za-z0-9]*}} = call reassoc nnan nsz arcp contract afn <2 x float> @lgc.input.import.builtin.InterpPerspCentroid.v2f32.i32(i32 {{.*}}) -; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x half> (...) @lgc.input.import.interpolated__v4f16(i1 false, i32 0, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <2 x float> @lgc.input.import.builtin.SamplePosOffset.v2f32.i32.i32( -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 +; SHADERTEST-COUNT-3:call reassoc nnan nsz arcp contract afn <4 x half> (...) 
@lgc.input.import.interpolated__v4f16(i1 false, ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results +; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp b/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp index ab2da40227..4c43ca83ff 100644 --- a/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp +++ b/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp @@ -48,8 +48,8 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: load i8, ptr addrspace(3) @{{.*}}, align {{1|4}} ; SHADERTEST: store i8 %{{[0-9]*}}, ptr addrspace(3) @{{.*}}, align {{1|4}} -; SHADERTEST-COUNT-3: load <{{[2-4]}} x i8>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 {{14|28|44}}), align {{[2|4]}} -; SHADERTEST-COUNT-3: store <{{[2-4]}} x i8> %{{[0-9]*}}, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 {{14|28|44}}), align {{[2|4]}} +; SHADERTEST-COUNT-3: load <{{[2-4]}} x i8>, ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 {{14|28|44}}), align {{[2|4]}} +; SHADERTEST-COUNT-3: store <{{[2-4]}} x i8> %{{[0-9]*}}, ptr addrspace(3) getelementptr inbounds {{(nuw )?}}(i8, ptr addrspace(3) @{{.*}}, i32 {{14|28|44}}), align {{[2|4]}} ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/extensions/ExtXfb_TessGsDoubleOutput_lit.geom b/llpc/test/shaderdb/extensions/ExtXfb_TessGsDoubleOutput_lit.geom index 92cbe9911e..56fbe730fc 100644 --- a/llpc/test/shaderdb/extensions/ExtXfb_TessGsDoubleOutput_lit.geom +++ b/llpc/test/shaderdb/extensions/ExtXfb_TessGsDoubleOutput_lit.geom @@ -26,9 +26,9 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM 
translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v3f64 +; SHADERTEST: call void (...) @lgc.write.xfb.output({{.*}}<3 x double> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v3f64 -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v2f64 +; SHADERTEST: call void (...) @lgc.write.xfb.output({{.*}}<2 x double> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v2f64 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/extensions/ExtXfb_TestGsFloatOutput_lit.geom b/llpc/test/shaderdb/extensions/ExtXfb_TestGsFloatOutput_lit.geom index 4dec233a77..ff4e64d3bd 100644 --- a/llpc/test/shaderdb/extensions/ExtXfb_TestGsFloatOutput_lit.geom +++ b/llpc/test/shaderdb/extensions/ExtXfb_TestGsFloatOutput_lit.geom @@ -26,9 +26,9 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v3f32 +; SHADERTEST: call void (...) @lgc.write.xfb.output({{.*}}<3 x float> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v3f32 -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v2f32 +; SHADERTEST: call void (...) 
@lgc.write.xfb.output({{.*}}<2 x float> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v2f32 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/extensions/ExtXfb_TestTesDoubleOutput_lit.tese b/llpc/test/shaderdb/extensions/ExtXfb_TestTesDoubleOutput_lit.tese index 485d3d1670..472fcd35a0 100644 --- a/llpc/test/shaderdb/extensions/ExtXfb_TestTesDoubleOutput_lit.tese +++ b/llpc/test/shaderdb/extensions/ExtXfb_TestTesDoubleOutput_lit.tese @@ -16,9 +16,9 @@ void main(void) ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v3f64 +; SHADERTEST: call void (...) @lgc.write.xfb.output({{.*}}<3 x double> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v3f64 -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v2f64 +; SHADERTEST: call void (...) @lgc.write.xfb.output({{.*}}<2 x double> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v2f64 ; SHADERTEST: .vgt_strmout_buffer_config: ; SHADERTEST: .stream_0_buffer_en: 0x0000000000000003 diff --git a/llpc/test/shaderdb/extensions/ExtXfb_TestTesFloatOutput_lit.tese b/llpc/test/shaderdb/extensions/ExtXfb_TestTesFloatOutput_lit.tese index 99f7982277..42c1e5447e 100644 --- a/llpc/test/shaderdb/extensions/ExtXfb_TestTesFloatOutput_lit.tese +++ b/llpc/test/shaderdb/extensions/ExtXfb_TestTesFloatOutput_lit.tese @@ -16,9 +16,9 @@ void main(void) ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v3f32 +; SHADERTEST: call void (...) 
@lgc.write.xfb.output({{.*}}<3 x float> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v3f32 -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v2f32 +; SHADERTEST: call void (...) @lgc.write.xfb.output({{.*}}<2 x float> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v2f32 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/extensions/ExtXfb_TestVsDoubleOutput_lit.vert b/llpc/test/shaderdb/extensions/ExtXfb_TestVsDoubleOutput_lit.vert index 9cb42fad73..fa02a91e75 100644 --- a/llpc/test/shaderdb/extensions/ExtXfb_TestVsDoubleOutput_lit.vert +++ b/llpc/test/shaderdb/extensions/ExtXfb_TestVsDoubleOutput_lit.vert @@ -14,9 +14,9 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v3f64 +; SHADERTEST: call void (...) @lgc.write.xfb.output({{.*}}<3 x double> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v3f64 -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v2f64 +; SHADERTEST: call void (...) @lgc.write.xfb.output({{.*}}<2 x double> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v2f64 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/extensions/ExtXfb_TestVsFloatOutput_lit.vert b/llpc/test/shaderdb/extensions/ExtXfb_TestVsFloatOutput_lit.vert index 9144e1d3f1..c42087575f 100644 --- a/llpc/test/shaderdb/extensions/ExtXfb_TestVsFloatOutput_lit.vert +++ b/llpc/test/shaderdb/extensions/ExtXfb_TestVsFloatOutput_lit.vert @@ -14,9 +14,9 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v3f32 +; SHADERTEST: call void (...) 
@lgc.write.xfb.output({{.*}}<3 x float> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v3f32 -; SHADERTEST: call void @lgc.output.export.xfb{{.*}}v2f32 +; SHADERTEST: call void (...) @lgc.write.xfb.output({{.*}}<2 x float> ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v2f32 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateAtOffset_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateAtOffset_lit.frag index 85685c4074..4788fe0532 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateAtOffset_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateAtOffset_lit.frag @@ -25,13 +25,9 @@ void main() ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn float @interpolateAtOffset.f32.p64.v2f32(ptr addrspace(64) @{{.*}}, <2 x float> %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> @interpolateAtOffset.v4f32.p64.v2f32(ptr addrspace(64) @{{.*}}, <2 x float> %{{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 -; SHADERTEST: fmul reassoc nnan nsz arcp contract afn <3 x float> -; SHADERTEST: fadd reassoc nnan nsz arcp contract afn <3 x float> -; SHADERTEST: fmul reassoc nnan nsz arcp contract afn <3 x float> -; SHADERTEST: fadd reassoc nnan nsz arcp contract afn <3 x float> -; SHADERTEST: = call reassoc nnan nsz arcp contract afn float (...) @lgc.input.import.interpolated__f32( +; SHADERTEST: call reassoc nnan nsz arcp contract afn <2 x float> (...) @lgc.eval.Ij.offset.smooth__v2f32(<2 x float> +; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) 
@lgc.input.import.interpolated__f32(i1 false, i32 0, i32 0, i32 0, i32 poison, i32 0, <2 x float> +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false ; SHADERTEST-LABEL: _amdgpu_ps_main ; SHADERTEST-COUNT-6: v_fmac_f32_e32 ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateAtSample_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateAtSample_lit.frag index dcec685faf..ba582c3bfd 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateAtSample_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateAtSample_lit.frag @@ -25,11 +25,10 @@ void main() ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn float @interpolateAtSample.f32.p64.i32(ptr addrspace(64) @{{.*}}, i32 %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> @interpolateAtSample.v4f32.p64.i32(ptr addrspace(64) @{{.*}}, i32 %{{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn <2 x float> @lgc.input.import.builtin.SamplePosOffset.v2f32.i32.i32( -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode.v3f32.i32( -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn float (...) @lgc.input.import.interpolated__f32(i1 false, i32 0, i32 0, i32 0, i32 poison, i32 0, <2 x float> -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 1, i32 0, i32 0, i32 poison, i32 1, i32 poison +; SHADERTEST: call reassoc nnan nsz arcp contract afn <2 x float> @lgc.input.import.builtin.SamplePosOffset.v2f32.i32.i32(i32 268435463 +; SHADERTEST: call reassoc nnan nsz arcp contract afn <2 x float> (...) 
@lgc.eval.Ij.offset.smooth__v2f32(<2 x float> +; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.input.import.interpolated__f32(i1 false +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: %{{[0-9]*}} = call float @llvm.amdgcn.interp.p1(float %{{.*}}, i32 0, i32 0, i32 %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call float @llvm.amdgcn.interp.p2(float %{{.*}}, float %{{.*}}, i32 0, i32 0, i32 %{{.*}}) diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DArray.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DArray.frag index 5b0a9d7164..178c858f48 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DArray.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DArray.frag @@ -17,12 +17,7 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> @interpolateAtOffset.v4f32.p64.v2f32(ptr addrspace(64) %{{.*}}, <2 x float> {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 -; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 17, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) -; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 18, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) -; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.input.import.interpolated__v4f32(i1 false, i32 19, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) -; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 20, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) +; SHADERTEST-COUNT-4: call reassoc nnan nsz arcp contract afn <2 x float> (...) @lgc.eval.Ij.offset.smooth__v2f32(<2 x float> ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DArrayInStruct.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DArrayInStruct.frag index 68a4490924..be8e131210 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DArrayInStruct.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DArrayInStruct.frag @@ -28,13 +28,7 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> @interpolateAtOffset.v4f32.p64.v2f32({{<4 x float> addrspace\(64\)\*|ptr addrspace\(64\)}} %{{.*}}, <2 x float> {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 -; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) -; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 4, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) -; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.input.import.interpolated__v4f32(i1 false, i32 5, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) -; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 6, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) -; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 7, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) +; SHADERTEST-COUNT-5: call reassoc nnan nsz arcp contract afn <2 x float> (...) @lgc.eval.Ij.offset.smooth__v2f32(<2 x float> zeroinitializer) ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DStructArray.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DStructArray.frag index 6dab9318b3..bb16655cd6 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DStructArray.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx1DStructArray.frag @@ -25,11 +25,7 @@ void main() // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> @interpolateAtOffset.v4f32.p64.v2f32(ptr addrspace(64) %{{.*}}, <2 x float> {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 ; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) ; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.input.import.interpolated__v4f32(i1 false, i32 7, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) ; SHADERTEST-DAG: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 11, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DArrayInStruct.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DArrayInStruct.frag index faf43e8552..4981f61cb2 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DArrayInStruct.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DArrayInStruct.frag @@ -22,16 +22,12 @@ void main() } - - // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> @interpolateAtOffset.v4f32.p64.v2f32(ptr addrspace(64) %{{.*}}, <2 x float> {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 2, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.input.import.interpolated__v4f32(i1 false, i32 4, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DArrayInStructInArray.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DArrayInStructInArray.frag index 0e495c5865..64a04043dd 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DArrayInStructInArray.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DArrayInStructInArray.frag @@ -23,14 +23,13 @@ void main() { frag_color = interpolateAtOffset(interp[x].s2.array[y][z], vec2(0)); } + // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> @interpolateAtOffset.v4f32.p64.v2f32(ptr addrspace(64) %{{.*}}, <2 x float> {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 12, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 13, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.input.import.interpolated__v4f32(i1 false, i32 14, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DStructArray.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DStructArray.frag index 5086ab3fab..5cad2538f1 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DStructArray.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx2DStructArray.frag @@ -27,8 +27,6 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> @interpolateAtOffset.v4f32.p64.v2f32(ptr addrspace(64) %{{.*}}, <2 x float> {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 2, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 5, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.input.import.interpolated__v4f32(i1 false, i32 8, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx3DArray.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx3DArray.frag index 49231000d4..f900c0c57a 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx3DArray.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdx3DArray.frag @@ -11,15 +11,11 @@ void main() frag_color = interpolateAtOffset(interp[x][y][z], vec2(0)); } - - // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> @interpolateAtOffset.v4f32.p64.v2f32(ptr addrspace(64) %{{.*}}, <2 x float> {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.input.import.interpolated__v4f32(i1 false, i32 4, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.input.import.interpolated__v4f32(i1 false, i32 5, i32 0, i32 0, i32 poison, i32 0, <2 x float> %{{.*}}) diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdxVector.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdxVector.frag index d762aaaf46..4133e86480 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdxVector.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestInterpolateDynIdxVector.frag @@ -12,24 +12,17 @@ void main() frag_color.x = interpolateAtSample(interp[component], gl_SampleID); frag_color.y = interpolateAtSample(interp2[component][0], gl_SampleID); } + // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-COUNT-2: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn float @interpolateAtSample.f32.p64.i32(ptr addrspace(64) %{{.*}}, i32 %{{.*}}) + ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn <2 x float> @lgc.input.import.builtin.SamplePosOffset.v2f32.i32.i32( -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode.v3f32.i32( -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn <2 x float> (...) @lgc.input.import.interpolated__v2f32(i1 false, i32 0, i32 0, i32 0, i32 poison, i32 0, -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn <2 x float> @lgc.input.import.builtin.SamplePosOffset.v2f32.i32.i32( -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode.v3f32.i32( -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn float (...) 
@lgc.input.import.interpolated__f32(i1 false, i32 1, i32 0, i32 0, i32 poison, i32 0, <2 x float> -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn <2 x float> @lgc.input.import.builtin.SamplePosOffset.v2f32.i32.i32( -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode.v3f32.i32( -; SHADERTEST-COUNT-12: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 -; SHADERTEST-DAG: = call reassoc nnan nsz arcp contract afn float (...) @lgc.input.import.interpolated__f32(i1 false, i32 2, i32 0, i32 0, i32 poison, i32 0, <2 x float> +; SHADERTEST: call reassoc nnan nsz arcp contract afn <2 x float> (...) @lgc.input.import.interpolated__v2f32(i1 false, i32 0 +; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.input.import.interpolated__f32(i1 false, i32 1 +; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.input.import.interpolated__f32(i1 false, i32 2 ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestUnpackHalf2x16_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestUnpackHalf2x16_lit.frag index 6890e34d92..03a3156de0 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestUnpackHalf2x16_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestUnpackHalf2x16_lit.frag @@ -18,7 +18,7 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: %[[BITCAST:.*]] = bitcast i32 %{{.*}} to <2 x half> -; SHADERTEST: = fpext <2 x half> %[[BITCAST]] to <2 x float> +; SHADERTEST: = fpext {{.*}}<2 x half> %[[BITCAST]] to <2 x float> ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/general/CallInstAsUserOfGlobalVariable.spvasm b/llpc/test/shaderdb/general/CallInstAsUserOfGlobalVariable.spvasm index 004ecc957d..e75993c573 100644 --- 
a/llpc/test/shaderdb/general/CallInstAsUserOfGlobalVariable.spvasm +++ b/llpc/test/shaderdb/general/CallInstAsUserOfGlobalVariable.spvasm @@ -1,6 +1,6 @@ ; This test checks if lowerGlobal is handling properly case with removed zero-index GEPs. -; @_ug_input23 = external addrspace(7) global [2 x <{ [4294967295 x float] }>], !spirv.Resource !2, !spirv.Block !1 +; @_ug_input23 = external addrspace(7) global [2 x <{ [0 x float] }>], !spirv.Resource !2, !spirv.Block !1 ; %2 = call i32 @lgc.buffer.length(ptr addrspace(7) @_ug_input23, i32 0) @@ -8,7 +8,7 @@ ; RUN: amdllpc -v -gfxip=11.0 %s | FileCheck %s ; CHECK-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; CHECK: @_ug_input23 = external addrspace(7) global [2 x <{ [4294967295 x float] }>], !spirv.Resource !2, !spirv.Block !1 +; CHECK: @_ug_input23 = external addrspace(7) global [2 x <{ [0 x float] }>], !spirv.Resource !2, !spirv.Block !1 ; CHECK: call i32 @lgc.buffer.length(ptr addrspace(7) @_ug_input23, i32 0) ; CHECK-LABEL: {{^// LLPC}} SPIR-V lowering results diff --git a/llpc/test/shaderdb/general/ImgDescLoad.comp b/llpc/test/shaderdb/general/ImgDescLoad.comp index f0f462d1fb..a4d63080a5 100644 --- a/llpc/test/shaderdb/general/ImgDescLoad.comp +++ b/llpc/test/shaderdb/general/ImgDescLoad.comp @@ -5,8 +5,8 @@ ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: [[IMG_DESC:%[0-9]*]] = load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}, align 4, !invariant.load !12 -; SHADERTEST: [[SMP_DESC:%[0-9]*]] = load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}, align 4, !invariant.load !12 +; SHADERTEST: [[IMG_DESC:%[0-9]*]] = load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}, align 4, !invariant.load !{{.*}} +; SHADERTEST: [[SMP_DESC:%[0-9]*]] = load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}, align 4, !invariant.load !{{.*}} ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> 
@llvm.amdgcn.image.sample.l.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[IMG_DESC]], <4 x i32> [[SMP_DESC]], i1 false, i32 0, i32 0) */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/general/PipelineCs_ForceMemoryBarrierScope.pipe b/llpc/test/shaderdb/general/PipelineCs_ForceMemoryBarrierScope.pipe new file mode 100644 index 0000000000..7f45c7ff25 --- /dev/null +++ b/llpc/test/shaderdb/general/PipelineCs_ForceMemoryBarrierScope.pipe @@ -0,0 +1,23 @@ +; BEGIN_SHADERTEST +; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results +; SHADERTEST: fence syncscope("workgroup") acq_rel +; SHADERTEST: AMDLLPC SUCCESS +; END_SHADERTEST + +; Check that forceMemoryBarrierScope changes syncscope from default "agent" (SPIR-V Device scope) to "workgroup" (SPIR-V Workgroup scope). + +[CsGlsl] +#version 450 + +layout(local_size_x = 16, local_size_y = 16) in; + +void main() +{ + memoryBarrier(); +} + + +[CsInfo] +entryPoint = main +options.forceMemoryBarrierScope = 2 diff --git a/llpc/test/shaderdb/general/PipelineCs_LdsSpillLimitDwordsOption.pipe b/llpc/test/shaderdb/general/PipelineCs_LdsSpillLimitDwordsOption.pipe index 04ea8b0b42..94d7ce2791 100644 --- a/llpc/test/shaderdb/general/PipelineCs_LdsSpillLimitDwordsOption.pipe +++ b/llpc/test/shaderdb/general/PipelineCs_LdsSpillLimitDwordsOption.pipe @@ -22,4 +22,4 @@ options.ldsSpillLimitDwords = 1024 ; CHECK: attributes #[[ATTR0:[0-9]+]] = { alwaysinline nounwind memory(readwrite) "amdgpu-flat-work-group-size"="66,66" "amdgpu-lds-spill-limit-dwords"="1024" "amdgpu-memory-bound"="false" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="1" "denormal-fp-math-f32"="preserve-sign" 
"target-features"=",+wavefrontsize64,+cumode,+enable-flat-scratch" } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 2, i32 3, i32 11} -; CHECK: [[META1:![0-9]+]] = !{!"Vulkan"} \ No newline at end of file +; CHECK: [[META1:![0-9]+]] = !{!"Vulkan"} diff --git a/llpc/test/shaderdb/general/PipelineCs_MultipleRootInlineBuffer.pipe b/llpc/test/shaderdb/general/PipelineCs_MultipleRootInlineBuffer.pipe index 20766fa0ab..dea3a8a240 100644 --- a/llpc/test/shaderdb/general/PipelineCs_MultipleRootInlineBuffer.pipe +++ b/llpc/test/shaderdb/general/PipelineCs_MultipleRootInlineBuffer.pipe @@ -15,7 +15,7 @@ ; SHADERTEST: [[buf1:%[0-9]*]] = ptrtoint ptr addrspace(4) [[buf_addr1]] to i64 ; Get the "fat pointer" for the buffer -; SHADERTEST: call ptr addrspace(7) @lgc.buffer.addr.to.ptr(i64 [[buf1]]) +; SHADERTEST: call ptr addrspace(7) @lgc.buffer.addr.to.ptr(i64 [[buf1]], i1 false) ; Get a pointer to the first inline buffer. Offset 4 comes from the user data nodes ; SHADERTEST: [[buf_addr0:%[0-9]*]] = call ptr addrspace(4) @lgc.user.data(i32 4) diff --git a/llpc/test/shaderdb/general/PipelineTcsTes_TestLocMapLoadBuiltInOutput.pipe b/llpc/test/shaderdb/general/PipelineTcsTes_TestLocMapLoadBuiltInOutput.pipe index fc1b55ac82..f39d6a99b7 100644 --- a/llpc/test/shaderdb/general/PipelineTcsTes_TestLocMapLoadBuiltInOutput.pipe +++ b/llpc/test/shaderdb/general/PipelineTcsTes_TestLocMapLoadBuiltInOutput.pipe @@ -1,13 +1,10 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST-LABEL: {{^// LLPC}} tessellation calculation factor results -; SHADERTEST: Patch constant count: 0 -; SHADERTEST: Patch constant size (in dwords): 0 -; SHADERTEST: Patch constant total size (in dwords): 0 +; SHADERTEST-LABEL: {{^// LLPC}} HW tessellation configurations +; SHADERTEST: PatchConstants = 0, Size = [0, 0] dwords ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call void 
@llvm.amdgcn.raw.tbuffer.store.v4f32 -; SHADERTEST-NEXT: br label %.endHs +; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.v4f32 ; SHADERTEST: AMDLLPC SUCCESS ; END_SHADERTEST diff --git a/llpc/test/shaderdb/general/PipelineTess_TestInOutPacking.pipe b/llpc/test/shaderdb/general/PipelineTess_TestInOutPacking.pipe index be9989bb8e..19f2530807 100644 --- a/llpc/test/shaderdb/general/PipelineTess_TestInOutPacking.pipe +++ b/llpc/test/shaderdb/general/PipelineTess_TestInOutPacking.pipe @@ -2,35 +2,35 @@ ; RUN: amdllpc -enable-part-pipeline=0 -v %gfxip %s | FileCheck -check-prefix=SHADERTEST_PP0 %s ; SHADERTEST_PP0-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST_PP0: [[VERTEX_BASE:%[0-9a-zA-Z.]+]] = mul i32 %{{[0-9]*}}, 192 +; SHADERTEST_PP0: [[VERTEX_BASE:%[0-9a-zA-Z.]+]] = mul i32 %{{[0-9]*}}, 196 ; SHADERTEST_PP0: [[P0:%[0-9a-zA-Z.]+]] = getelementptr {{i8|i32}}, ptr addrspace(3) {{.*}}, i32 [[VERTEX_BASE]] -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1536 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1540 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1552 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1556 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1568 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1572 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1576 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1584 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1600 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1616 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1632 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 
1648 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1652 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1656 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1660 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1664 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1668 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1680 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1684 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1688 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1692 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1696 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1700 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1712 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1716 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1720 -; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1724 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5120 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5124 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5136 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5140 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5152 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5156 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5160 +; SHADERTEST_PP0: %{{[0-9]*}} = 
getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5168 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5184 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5200 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5216 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5232 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5236 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5240 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5244 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5248 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5252 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5264 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5268 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5272 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5276 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5280 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5284 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5296 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5300 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5304 +; SHADERTEST_PP0: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5308 ; SHADERTEST_PP0: call void @llvm.amdgcn.exp.f32(i32 {{.*}}32, i32 {{.*}}15, float %{{[^,]*}}, float %{{[^,]*}}, float %{{[^,]*}}, float %{{[^,]*}}, i1 {{.*}}false, i1 {{.*}}false) ; SHADERTEST_PP0: call void @llvm.amdgcn.exp.f32(i32 {{.*}}33, i32 
{{.*}}3, float %{{[^,]*}}, float %{{[^,]*}}, float poison, float poison, i1 {{.*}}false, i1 {{.*}}false) ; SHADERTEST_PP0: call float @llvm.amdgcn.interp.p1(float %{{[^,]*}}, i32 1, i32 1, i32 %PrimMask) @@ -54,35 +54,35 @@ ; SHADERTEST_PP1: call float @llvm.amdgcn.interp.p1(float %{{[^,]*}}, i32 1, i32 0, i32 %PrimMask) ; Pre-rasterization part-pipeline: ; SHADERTEST_PP1-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST_PP1: [[VERTEX_BASE:%[0-9a-zA-Z.]+]] = mul i32 %{{[0-9]*}}, 192 +; SHADERTEST_PP1: [[VERTEX_BASE:%[0-9a-zA-Z.]+]] = mul i32 %{{[0-9]*}}, 196 ; SHADERTEST_PP1: [[P0:%[0-9a-zA-Z.]+]] = getelementptr i8, ptr addrspace(3) {{.*}}, i32 [[VERTEX_BASE]] -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1536 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1540 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1552 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1556 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1568 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1572 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1576 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1584 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1600 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1616 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1632 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1648 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1652 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1656 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) 
[[P0]], i32 1660 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1664 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1668 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1680 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1684 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1688 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1692 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1696 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1700 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1712 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1716 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1720 -; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 1724 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5120 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5124 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5136 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5140 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5152 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5156 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5160 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5168 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5184 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5200 +; SHADERTEST_PP1: 
%{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5216 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5232 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5236 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5240 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5244 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5248 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5252 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5264 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5268 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5272 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5276 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5280 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5284 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5296 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5300 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5304 +; SHADERTEST_PP1: %{{[0-9]*}} = getelementptr {{i8}}, ptr addrspace(3) [[P0]], i32 5308 ; SHADERTEST_PP1: call void @llvm.amdgcn.exp.f32(i32 {{.*}}32, i32 {{.*}}15, float %{{[^,]*}}, float %{{[^,]*}}, float %{{[^,]*}}, float %{{[^,]*}}, i1 {{.*}}false, i1 {{.*}}false) ; SHADERTEST_PP1: call void @llvm.amdgcn.exp.f32(i32 {{.*}}33, i32 {{.*}}3, float %{{[^,]*}}, float %{{[^,]*}}, float poison, float poison, i1 {{.*}}false, i1 {{.*}}false) ; SHADERTEST_PP1: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe 
b/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe index 53d387a973..b01ed348fa 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_DynamicSampleInfo.pipe @@ -77,7 +77,7 @@ attribute[1].binding = 0 attribute[1].format = VK_FORMAT_R32G32_SFLOAT attribute[1].offset = 16 ; SHADERTEST-LABEL: define {{[^@]+}}@lgc.shader.VS.main -; SHADERTEST-SAME: (i32 inreg noundef [[GLOBALTABLE:%.*]], i32 inreg noundef [[VERTEXBUFFERTABLE:%.*]], i32 inreg noundef [[BASEVERTEX:%.*]], i32 inreg noundef [[BASEINSTANCE:%.*]], i32 noundef [[VERTEXID:%.*]], i32 noundef [[RELVERTEXID:%.*]], i32 noundef [[PRIMITIVEID:%.*]], i32 noundef [[INSTANCEID:%.*]]) #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !12 !lgc.shaderstage !13 { +; SHADERTEST-SAME: (i32 inreg noundef [[GLOBALTABLE:%.*]], i32 inreg noundef [[VERTEXBUFFERTABLE:%.*]], i32 inreg noundef [[BASEVERTEX:%.*]], i32 inreg noundef [[BASEINSTANCE:%.*]], i32 noundef [[VERTEXID:%.*]], i32 noundef [[RELVERTEXID:%.*]], i32 noundef [[PRIMITIVEID:%.*]], i32 noundef [[INSTANCEID:%.*]]) #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !13 !lgc.shaderstage !14 { ; SHADERTEST-NEXT: .entry: ; SHADERTEST-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; SHADERTEST-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -86,7 +86,7 @@ attribute[1].offset = 16 ; SHADERTEST-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 ; SHADERTEST-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(4) ; SHADERTEST-NEXT: [[TMP5:%.*]] = getelementptr <4 x i32>, ptr addrspace(4) [[TMP4]], i64 0 -; SHADERTEST-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP5]], align 16, !invariant.load !14 +; SHADERTEST-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP5]], align 16, !invariant.load !15 ; SHADERTEST-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.struct.tbuffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP6]], i32 [[VERTEXINDEX]], i32 16, i32 0, i32 22, i32 0) ; 
SHADERTEST-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0 ; SHADERTEST-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.struct.tbuffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP6]], i32 [[VERTEXINDEX]], i32 20, i32 0, i32 22, i32 0) @@ -97,7 +97,7 @@ attribute[1].offset = 16 ; SHADERTEST-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP13]], i32 1 ; SHADERTEST-NEXT: [[VERTEX1_0:%.*]] = bitcast <2 x i32> [[TMP14]] to <2 x float> ; SHADERTEST-NEXT: [[TMP15:%.*]] = getelementptr <4 x i32>, ptr addrspace(4) [[TMP4]], i64 0 -; SHADERTEST-NEXT: [[TMP16:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP15]], align 16, !invariant.load !14 +; SHADERTEST-NEXT: [[TMP16:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP15]], align 16, !invariant.load !15 ; SHADERTEST-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.struct.tbuffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP16]], i32 [[VERTEXINDEX]], i32 0, i32 0, i32 22, i32 0) ; SHADERTEST-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i64 0 ; SHADERTEST-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.struct.tbuffer.load.i32{{(\.v4i32)?}}(<4 x i32> [[TMP16]], i32 [[VERTEXINDEX]], i32 4, i32 0, i32 22, i32 0) @@ -123,119 +123,49 @@ attribute[1].offset = 16 ; SHADERTEST-NEXT: [[TMP38:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP34]], i64 1 ; SHADERTEST-NEXT: [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP35]], i64 2 ; SHADERTEST-NEXT: [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP36]], i64 3 -; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP40]]) #[[ATTR7:[0-9]+]] -; SHADERTEST-NEXT: call void @lgc.output.export.builtin.Position.i32.v4f32(i32 0, <4 x float> [[VERTEX0_0]]) #[[ATTR7]] +; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP40]]) #[[ATTR4:[0-9]+]] +; SHADERTEST-NEXT: call void @lgc.output.export.builtin.Position.i32.v4f32(i32 0, <4 x float> 
[[VERTEX0_0]]) #[[ATTR4]] ; SHADERTEST-NEXT: ret void ; ; ; SHADERTEST-LABEL: define {{[^@]+}}@lgc.shader.FS.main -; SHADERTEST-SAME: (i32 inreg noundef [[GLOBALTABLE:%.*]], i32 inreg noundef [[COMPOSITEDATA:%.*]], i32 inreg noundef [[PRIMMASK:%.*]], <2 x float> noundef [[PERSPINTERPSAMPLE:%.*]], <2 x float> noundef [[PERSPINTERPCENTER:%.*]], <2 x float> noundef [[PERSPINTERPCENTROID:%.*]], <3 x float> noundef [[PERSPINTERPPULLMODE:%.*]], <2 x float> noundef [[LINEARINTERPSAMPLE:%.*]], <2 x float> noundef [[LINEARINTERPCENTER:%.*]], <2 x float> noundef [[LINEARINTERPCENTROID:%.*]], float noundef [[LINESTIPPLE:%.*]], float noundef [[FRAGCOORDX:%.*]], float noundef [[FRAGCOORDY:%.*]], float noundef [[FRAGCOORDZ:%.*]], float noundef [[FRAGCOORDW:%.*]], i32 noundef [[FRONTFACING:%.*]], i32 noundef [[ANCILLARY:%.*]], i32 noundef [[SAMPLECOVERAGE:%.*]], i32 noundef [[FIXEDXY:%.*]]) #[[ATTR1:[0-9]+]] !spirv.ExecutionModel !15 !lgc.shaderstage !16 { +; SHADERTEST-SAME: (i32 inreg noundef [[GLOBALTABLE:%.*]], i32 inreg noundef [[COMPOSITEDATA:%.*]], i32 inreg noundef [[PRIMMASK:%.*]], <2 x float> noundef [[PERSPINTERPSAMPLE:%.*]], <2 x float> noundef [[PERSPINTERPCENTER:%.*]], <2 x float> noundef [[PERSPINTERPCENTROID:%.*]], <3 x float> noundef [[PERSPINTERPPULLMODE:%.*]], <2 x float> noundef [[LINEARINTERPSAMPLE:%.*]], <2 x float> noundef [[LINEARINTERPCENTER:%.*]], <2 x float> noundef [[LINEARINTERPCENTROID:%.*]], float noundef [[LINESTIPPLE:%.*]], float noundef [[FRAGCOORDX:%.*]], float noundef [[FRAGCOORDY:%.*]], float noundef [[FRAGCOORDZ:%.*]], float noundef [[FRAGCOORDW:%.*]], i32 noundef [[FRONTFACING:%.*]], i32 noundef [[ANCILLARY:%.*]], i32 noundef [[SAMPLECOVERAGE:%.*]], i32 noundef [[FIXEDXY:%.*]]) #[[ATTR2:[0-9]+]] !spirv.ExecutionModel !16 !lgc.shaderstage !17 { ; SHADERTEST-NEXT: .entry: -; SHADERTEST-NEXT: [[INTERPPERSPSAMPLE:%.*]] = call <2 x float> @lgc.input.import.builtin.InterpPerspSample.v2f32.i32(i32 268435456) #[[ATTR3:[0-9]+]] +; SHADERTEST-NEXT: 
[[INTERPPERSPSAMPLE:%.*]] = call <2 x float> @lgc.input.import.builtin.InterpPerspSample.v2f32.i32(i32 268435456) #[[ATTR5:[0-9]+]] ; SHADERTEST-NEXT: [[TMP0:%.*]] = call float (...) @lgc.input.import.interpolated__f32(i1 false, i32 1, i32 0, i32 0, i32 poison, i32 0, <2 x float> [[INTERPPERSPSAMPLE]]) ; SHADERTEST-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0 ; SHADERTEST-NEXT: [[TMP2:%.*]] = call float (...) @lgc.input.import.interpolated__f32(i1 false, i32 1, i32 0, i32 1, i32 poison, i32 0, <2 x float> [[INTERPPERSPSAMPLE]]) ; SHADERTEST-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP2]], i64 1 -; SHADERTEST-NEXT: [[SAMPLEPOSITION:%.*]] = call <2 x float> @lgc.input.import.builtin.SamplePosition.v2f32.i32(i32 19) #[[ATTR3]] +; SHADERTEST-NEXT: [[SAMPLEPOSITION:%.*]] = call <2 x float> @lgc.input.import.builtin.SamplePosition.v2f32.i32(i32 19) #[[ATTR5]] ; SHADERTEST-NEXT: [[TMP4:%.*]] = fadd reassoc nnan nsz arcp contract afn <2 x float> [[SAMPLEPOSITION]], {{(splat \(float \-5\.000000e\-01\))|()}} -; SHADERTEST-NEXT: [[INTERPPULLMODE:%.*]] = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode.v3f32.i32(i32 268435459) #[[ATTR3]] -; SHADERTEST-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i64 0 -; SHADERTEST-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i64 1 -; SHADERTEST-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP5]], i64 0 -; SHADERTEST-NEXT: [[DOTSPLAT:%.*]] = shufflevector <3 x float> [[DOTSPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer -; SHADERTEST-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0 -; SHADERTEST-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <3 x float> [[DOTSPLATINSERT1]], <3 x float> poison, <3 x i32> zeroinitializer -; SHADERTEST-NEXT: [[TMP7:%.*]] = extractelement <3 x float> [[INTERPPULLMODE]], i64 0 -; SHADERTEST-NEXT: [[TMP8:%.*]] = bitcast 
float [[TMP7]] to i32 -; SHADERTEST-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP8]], i32 245, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP9]] to float -; SHADERTEST-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP8]], i32 160, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP11]] to float -; SHADERTEST-NEXT: [[TMP13:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP10]], [[TMP12]] -; SHADERTEST-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP13]]) -; SHADERTEST-NEXT: [[TMP15:%.*]] = insertelement <3 x float> poison, float [[TMP14]], i64 0 -; SHADERTEST-NEXT: [[TMP16:%.*]] = extractelement <3 x float> [[INTERPPULLMODE]], i64 1 -; SHADERTEST-NEXT: [[TMP17:%.*]] = bitcast float [[TMP16]] to i32 -; SHADERTEST-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP17]], i32 245, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP18]] to float -; SHADERTEST-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP17]], i32 160, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP21:%.*]] = bitcast i32 [[TMP20]] to float -; SHADERTEST-NEXT: [[TMP22:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP19]], [[TMP21]] -; SHADERTEST-NEXT: [[TMP23:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP22]]) -; SHADERTEST-NEXT: [[TMP24:%.*]] = insertelement <3 x float> [[TMP15]], float [[TMP23]], i64 1 -; SHADERTEST-NEXT: [[TMP25:%.*]] = extractelement <3 x float> [[INTERPPULLMODE]], i64 2 -; SHADERTEST-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 -; SHADERTEST-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP26]], i32 245, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float -; SHADERTEST-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP26]], i32 160, i32 15, 
i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP30:%.*]] = bitcast i32 [[TMP29]] to float -; SHADERTEST-NEXT: [[TMP31:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP28]], [[TMP30]] -; SHADERTEST-NEXT: [[TMP32:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP31]]) -; SHADERTEST-NEXT: [[TMP33:%.*]] = insertelement <3 x float> [[TMP24]], float [[TMP32]], i64 2 -; SHADERTEST-NEXT: [[TMP34:%.*]] = extractelement <3 x float> [[INTERPPULLMODE]], i64 0 -; SHADERTEST-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; SHADERTEST-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP35]], i32 238, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float -; SHADERTEST-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP35]], i32 68, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP38]] to float -; SHADERTEST-NEXT: [[TMP40:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP37]], [[TMP39]] -; SHADERTEST-NEXT: [[TMP41:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP40]]) -; SHADERTEST-NEXT: [[TMP42:%.*]] = insertelement <3 x float> poison, float [[TMP41]], i64 0 -; SHADERTEST-NEXT: [[TMP43:%.*]] = extractelement <3 x float> [[INTERPPULLMODE]], i64 1 -; SHADERTEST-NEXT: [[TMP44:%.*]] = bitcast float [[TMP43]] to i32 -; SHADERTEST-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP44]], i32 238, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP46:%.*]] = bitcast i32 [[TMP45]] to float -; SHADERTEST-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP44]], i32 68, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP48:%.*]] = bitcast i32 [[TMP47]] to float -; SHADERTEST-NEXT: [[TMP49:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP46]], [[TMP48]] -; SHADERTEST-NEXT: [[TMP50:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP49]]) -; 
SHADERTEST-NEXT: [[TMP51:%.*]] = insertelement <3 x float> [[TMP42]], float [[TMP50]], i64 1 -; SHADERTEST-NEXT: [[TMP52:%.*]] = extractelement <3 x float> [[INTERPPULLMODE]], i64 2 -; SHADERTEST-NEXT: [[TMP53:%.*]] = bitcast float [[TMP52]] to i32 -; SHADERTEST-NEXT: [[TMP54:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP53]], i32 238, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP55:%.*]] = bitcast i32 [[TMP54]] to float -; SHADERTEST-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[TMP53]], i32 68, i32 15, i32 15, i1 true) -; SHADERTEST-NEXT: [[TMP57:%.*]] = bitcast i32 [[TMP56]] to float -; SHADERTEST-NEXT: [[TMP58:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP55]], [[TMP57]] -; SHADERTEST-NEXT: [[TMP59:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP58]]) -; SHADERTEST-NEXT: [[TMP60:%.*]] = insertelement <3 x float> [[TMP51]], float [[TMP59]], i64 2 -; SHADERTEST-NEXT: [[TMP61:%.*]] = fmul reassoc nnan nsz arcp contract afn <3 x float> [[TMP33]], [[DOTSPLAT]] -; SHADERTEST-NEXT: [[TMP62:%.*]] = fadd reassoc nnan nsz arcp contract afn <3 x float> [[INTERPPULLMODE]], [[TMP61]] -; SHADERTEST-NEXT: [[TMP63:%.*]] = fmul reassoc nnan nsz arcp contract afn <3 x float> [[TMP60]], [[DOTSPLAT2]] -; SHADERTEST-NEXT: [[TMP64:%.*]] = fadd reassoc nnan nsz arcp contract afn <3 x float> [[TMP62]], [[TMP63]] -; SHADERTEST-NEXT: [[TMP65:%.*]] = shufflevector <3 x float> [[TMP64]], <3 x float> [[TMP64]], <2 x i32> -; SHADERTEST-NEXT: [[TMP66:%.*]] = extractelement <3 x float> [[TMP64]], i64 2 -; SHADERTEST-NEXT: [[TMP67:%.*]] = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, [[TMP66]] -; SHADERTEST-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0 -; SHADERTEST-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer -; SHADERTEST-NEXT: [[TMP68:%.*]] = fmul reassoc nnan nsz arcp contract afn <2 x 
float> [[TMP65]], [[DOTSPLAT4]] -; SHADERTEST-NEXT: [[TMP69:%.*]] = call float (...) @lgc.input.import.interpolated__f32(i1 false, i32 0, i32 0, i32 0, i32 poison, i32 0, <2 x float> [[TMP68]]) -; SHADERTEST-NEXT: [[TMP70:%.*]] = insertelement <2 x float> poison, float [[TMP69]], i64 0 -; SHADERTEST-NEXT: [[TMP71:%.*]] = call float (...) @lgc.input.import.interpolated__f32(i1 false, i32 0, i32 0, i32 1, i32 poison, i32 0, <2 x float> [[TMP68]]) -; SHADERTEST-NEXT: [[TMP72:%.*]] = insertelement <2 x float> [[TMP70]], float [[TMP71]], i64 1 -; SHADERTEST-NEXT: [[TMP73:%.*]] = fsub reassoc nnan nsz arcp contract afn <2 x float> [[TMP72]], [[TMP3]] -; SHADERTEST-NEXT: [[TMP74:%.*]] = call reassoc nnan nsz arcp contract afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP73]]) -; SHADERTEST-NEXT: [[TMP75:%.*]] = extractelement <2 x float> [[TMP74]], i64 0 -; SHADERTEST-NEXT: [[TMP76:%.*]] = fcmp olt float [[TMP75]], 1.562500e-01 -; SHADERTEST-NEXT: [[TMP77:%.*]] = extractelement <2 x float> [[TMP74]], i64 1 -; SHADERTEST-NEXT: [[TMP78:%.*]] = fcmp olt float [[TMP77]], 1.562500e-01 -; SHADERTEST-NEXT: [[TMP79:%.*]] = and i1 [[TMP76]], [[TMP78]] -; SHADERTEST-NEXT: [[COND_FREEZE:%.*]] = freeze i1 [[TMP79]] +; SHADERTEST-NEXT: [[TMP5:%.*]] = call reassoc nnan nsz arcp contract afn <2 x float> (...) @lgc.eval.Ij.offset.smooth__v2f32(<2 x float> [[TMP4]]) +; SHADERTEST-NEXT: [[TMP6:%.*]] = call float (...) @lgc.input.import.interpolated__f32(i1 false, i32 0, i32 0, i32 0, i32 poison, i32 0, <2 x float> [[TMP5]]) +; SHADERTEST-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0 +; SHADERTEST-NEXT: [[TMP8:%.*]] = call float (...) 
@lgc.input.import.interpolated__f32(i1 false, i32 0, i32 0, i32 1, i32 poison, i32 0, <2 x float> [[TMP5]]) +; SHADERTEST-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP8]], i64 1 +; SHADERTEST-NEXT: [[TMP10:%.*]] = fsub reassoc nnan nsz arcp contract afn <2 x float> [[TMP9]], [[TMP3]] +; SHADERTEST-NEXT: [[TMP11:%.*]] = call reassoc nnan nsz arcp contract afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP10]]) +; SHADERTEST-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i64 0 +; SHADERTEST-NEXT: [[TMP13:%.*]] = fcmp olt float [[TMP12]], 1.562500e-01 +; SHADERTEST-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP11]], i64 1 +; SHADERTEST-NEXT: [[TMP15:%.*]] = fcmp olt float [[TMP14]], 1.562500e-01 +; SHADERTEST-NEXT: [[TMP16:%.*]] = and i1 [[TMP13]], [[TMP15]] +; SHADERTEST-NEXT: [[COND_FREEZE:%.*]] = freeze i1 [[TMP16]] ; SHADERTEST-NEXT: [[DOT:%.*]] = select i1 [[COND_FREEZE]], <4 x float> , <4 x float> -; SHADERTEST-NEXT: [[TMP80:%.*]] = extractelement <4 x float> [[DOT]], i64 0 -; SHADERTEST-NEXT: [[TMP81:%.*]] = extractelement <4 x float> [[DOT]], i64 1 -; SHADERTEST-NEXT: [[TMP82:%.*]] = extractelement <4 x float> [[DOT]], i64 2 -; SHADERTEST-NEXT: [[TMP83:%.*]] = extractelement <4 x float> [[DOT]], i64 3 -; SHADERTEST-NEXT: [[TMP84:%.*]] = insertelement <4 x float> poison, float [[TMP80]], i64 0 -; SHADERTEST-NEXT: [[TMP85:%.*]] = insertelement <4 x float> [[TMP84]], float [[TMP81]], i64 1 -; SHADERTEST-NEXT: [[TMP86:%.*]] = insertelement <4 x float> [[TMP85]], float [[TMP82]], i64 2 -; SHADERTEST-NEXT: [[TMP87:%.*]] = insertelement <4 x float> [[TMP86]], float [[TMP83]], i64 3 -; SHADERTEST-NEXT: [[TMP88:%.*]] = extractelement <4 x float> [[TMP87]], i32 0 -; SHADERTEST-NEXT: [[TMP89:%.*]] = extractelement <4 x float> [[TMP87]], i32 1 -; SHADERTEST-NEXT: [[TMP90:%.*]] = extractelement <4 x float> [[TMP87]], i32 2 -; SHADERTEST-NEXT: [[TMP91:%.*]] = extractelement <4 x float> [[TMP87]], i32 3 -; SHADERTEST-NEXT: 
[[TMP92:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[TMP88]], float [[TMP89]]) -; SHADERTEST-NEXT: [[TMP93:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[TMP90]], float [[TMP91]]) -; SHADERTEST-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> [[TMP92]], <2 x half> [[TMP93]], i1 true, i1 true) +; SHADERTEST-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DOT]], i64 0 +; SHADERTEST-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[DOT]], i64 1 +; SHADERTEST-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[DOT]], i64 2 +; SHADERTEST-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[DOT]], i64 3 +; SHADERTEST-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i64 0 +; SHADERTEST-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i64 1 +; SHADERTEST-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i64 2 +; SHADERTEST-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP20]], i64 3 +; SHADERTEST-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP24]], i32 0 +; SHADERTEST-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[TMP24]], i32 1 +; SHADERTEST-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP24]], i32 2 +; SHADERTEST-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[TMP24]], i32 3 +; SHADERTEST-NEXT: [[TMP29:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[TMP25]], float [[TMP26]]) +; SHADERTEST-NEXT: [[TMP30:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[TMP27]], float [[TMP28]]) +; SHADERTEST-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> [[TMP29]], <2 x half> [[TMP30]], i1 true, i1 true) ; SHADERTEST-NEXT: ret void ; diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe index 5b7d55e278..81db59dc1d 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe +++ 
b/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe @@ -167,7 +167,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .ps: ; CHECK-NEXT: .checksum_value: 0 ; CHECK-NEXT: .debug_mode: false -; CHECK-NEXT: .entry_point: _amdgpu_ps_main +; CHECK-NEXT: .entry_point{{(_symbol)?}}: _amdgpu_ps_main ; CHECK-NEXT: .float_mode: 0xc0 ; CHECK-NEXT: .ieee_mode: false ; CHECK: .mem_ordered: true diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestUberShader.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestUberShader.pipe index 0cdd2055b8..a2bbde6e2c 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestUberShader.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestUberShader.pipe @@ -94,7 +94,7 @@ userDataNode[2].visibility = 2 userDataNode[2].type = DescriptorBufferCompact userDataNode[2].offsetInDwords = 2 userDataNode[2].sizeInDwords = 2 -userDataNode[2].set = 0xFFFFFFFF +userDataNode[2].set = 0xFFFFFFF0 userDataNode[2].binding = 5 [GraphicsPipelineState] diff --git a/llpc/test/shaderdb/general/SubgroupShuffleIndexConstant.comp b/llpc/test/shaderdb/general/SubgroupShuffleIndexConstant.comp new file mode 100644 index 0000000000..0a6d93ce97 --- /dev/null +++ b/llpc/test/shaderdb/general/SubgroupShuffleIndexConstant.comp @@ -0,0 +1,25 @@ +// RUN: amdllpc -gfxip 11.0 -filetype=asm -o - %s | FileCheck -check-prefix=GFX11 %s + +// GFX11-NOT: ds_bpermute_b32 +// GFX11: v_readlane_b32 +// GFX11-NOT: ds_bpermute_b32 + +#version 450 +#extension GL_KHR_shader_subgroup_shuffle : enable + +layout(local_size_x = 32) in; + +layout(set = 0, binding = 0, std430) buffer InBuffer { + int a[32]; +} inbuf; + +layout(set = 0, binding = 1, std430) buffer OutBuffer { + int b[32]; +} outbuf; + +void main() { + uint id = gl_SubgroupInvocationID; + int x = inbuf.a[id]; + x = subgroupShuffle(x, 5); + outbuf.b[id] = x; +} diff --git a/llpc/test/shaderdb/general/SubgroupShuffleIndexDivergent.comp b/llpc/test/shaderdb/general/SubgroupShuffleIndexDivergent.comp new file mode 
100644 index 0000000000..517f25cacb --- /dev/null +++ b/llpc/test/shaderdb/general/SubgroupShuffleIndexDivergent.comp @@ -0,0 +1,31 @@ +// RUN: amdllpc -gfxip 11.0 -filetype=asm -o - %s | FileCheck -check-prefix=GFX11 %s + +// TODO: Should use Wave32 and a single ds_bpermute_b32 + +// GFX11: v_permlane64_b32 +// GFX11: ds_bpermute_b32 +// GFX11: ds_bpermute_b32 + +#version 450 +#extension GL_KHR_shader_subgroup_shuffle : enable + +layout(local_size_x = 32) in; + +layout(set = 0, binding = 0, std430) buffer InBuffer { + int a[32]; +} inbuf; + +layout(set = 0, binding = 1, std430) buffer OutBuffer { + int b[32]; +} outbuf; + +layout(set = 0, binding = 1, std430) buffer PermBuffer { + int perm[32]; +} permbuf; + +void main() { + uint id = gl_SubgroupInvocationID; + int x = inbuf.a[id]; + x = subgroupShuffle(x, permbuf.perm[id]); + outbuf.b[id] = x; +} diff --git a/llpc/test/shaderdb/general/SubgroupShuffleIndexUniform.comp b/llpc/test/shaderdb/general/SubgroupShuffleIndexUniform.comp new file mode 100644 index 0000000000..4d7cb54b95 --- /dev/null +++ b/llpc/test/shaderdb/general/SubgroupShuffleIndexUniform.comp @@ -0,0 +1,31 @@ +// RUN: amdllpc -gfxip 11.0 -filetype=asm -o - %s | FileCheck -check-prefix=GFX11 %s + +// TODO: Should use v_readlane + +// GFX11: v_permlane64_b32 +// GFX11: ds_bpermute_b32 +// GFX11: ds_bpermute_b32 + +#version 450 +#extension GL_KHR_shader_subgroup_shuffle : enable + +layout(local_size_x = 32) in; + +layout(push_constant) uniform constants { + uint lane; +}; + +layout(set = 0, binding = 0, std430) buffer InBuffer { + int a[32]; +} inbuf; + +layout(set = 0, binding = 1, std430) buffer OutBuffer { + int b[32]; +} outbuf; + +void main() { + uint id = gl_SubgroupInvocationID; + int x = inbuf.a[id]; + x = subgroupShuffle(x, lane); + outbuf.b[id] = x; +} diff --git a/llpc/test/shaderdb/general/TestPatchBufferOp.comp b/llpc/test/shaderdb/general/TestPatchBufferOp.comp index 4e9bb824e8..38f271aa03 100644 --- 
a/llpc/test/shaderdb/general/TestPatchBufferOp.comp +++ b/llpc/test/shaderdb/general/TestPatchBufferOp.comp @@ -1,4 +1,4 @@ -// This test case checks whether a phi is well-handed in PatchBufferOp pass. This shader will result in a phi in IR that +// This test case checks whether a phi is well-handed in LowerBufferOperations pass. This shader will result in a phi in IR that // one of the incoming value comes from downstream of the control flow. // BEGIN_SHADERTEST /* diff --git a/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp b/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp index 34a66b5b96..e7668ced75 100644 --- a/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp +++ b/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp @@ -15,8 +15,9 @@ void main() { test = gl_WorkGroupID.x; } + // CHECK-LABEL: define {{[^@]+}}@_amdgpu_cs_main -// CHECK-SAME: (i32 inreg noundef [[GLOBALTABLE:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[WORKGROUPID1:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], <3 x i32> noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !5 !lgc.shaderstage !6 { +// CHECK-SAME: (i32 inreg noundef [[GLOBALTABLE:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[WORKGROUPID1:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], <3 x i32> noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !6 !lgc.shaderstage !7 { // CHECK-NEXT: .entry: // CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() // CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], -4294967296 @@ -32,7 +33,7 @@ void main() // CHECK: attributes #[[ATTR0]] = { alwaysinline nounwind memory(readwrite) "amdgpu-flat-work-group-size"="256,256" "amdgpu-memory-bound"="false" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="3" "denormal-fp-math-f32"="preserve-sign" 
"target-features"=",+wavefrontsize64,+cumode,+enable-flat-scratch" } // CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } // CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } -// CHECK: attributes #[[ATTR3:[0-9]+]] = {{{.*}} nounwind willreturn memory(none) } +// CHECK: attributes #[[ATTR3:[0-9]+]] = { nodivergencesource nounwind willreturn memory(none) } // CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(write) } //. // CHECK: [[META0:![0-9]+]] = !{i32 16, i32 16, i32 1} diff --git a/llpc/test/shaderdb/general/UndefVertexOutput.spvasm b/llpc/test/shaderdb/general/UndefVertexOutput.spvasm index 7f5c983ad2..4e4fb93e04 100644 --- a/llpc/test/shaderdb/general/UndefVertexOutput.spvasm +++ b/llpc/test/shaderdb/general/UndefVertexOutput.spvasm @@ -93,8 +93,8 @@ ; CHECK-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: exp pos0 v0, v1, v2, v3 done -; CHECK-NEXT: exp param2 v5, v4, v4, v5 -; CHECK-NEXT: exp param1 v4, v4, v5, v5 ; CHECK-NEXT: exp param3 off, v4, off, off +; CHECK-NEXT: exp param1 v4, v4, v5, v5 +; CHECK-NEXT: exp param2 v5, v4, v4, v5 ; CHECK-NEXT: .LBB0_6: ; CHECK-NEXT: s_endpgm diff --git a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe index b68f5d6674..e12fc9384e 100644 --- a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe +++ b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe @@ -26,7 +26,7 @@ entryPoint = main ; SHADERTEST-LABEL: amdgpu_vs_main: ; SHADERTEST: v_mov_b32_e32 v0, 1.0 ; SHADERTEST-NEXT: exp pos0 v0, v0, v0, v0 -; SHADERTEST-NEXT: exp pos1 v0, v0, v0, v0 done +; SHADERTEST-NEXT: exp pos1 v0, off, off, off done ; SHADERTEST-NEXT: s_endpgm ; ; 
SHADERTEST-LABEL: amdgpu_ps_main: diff --git a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Cs.pipe b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Cs.pipe index 55d22aff84..8dc598c124 100644 --- a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Cs.pipe +++ b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Cs.pipe @@ -160,7 +160,7 @@ options.threadGroupSwizzleMode = Default ; CHECK-NEXT: .cs: ; CHECK-NEXT: .checksum_value: 0x{{[0-9a-f]+}} ; CHECK-NEXT: .debug_mode: false -; CHECK-NEXT: .entry_point: _amdgpu_cs_main +; CHECK-NEXT: .entry_point{{(_symbol)?}}: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 ; CHECK-NEXT: .ieee_mode: false diff --git a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe index af434be2e9..3942458c66 100644 --- a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe +++ b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe @@ -292,7 +292,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .gs: ; CHECK-NEXT: .checksum_value: 0x3e7a1455 ; CHECK-NEXT: .debug_mode: false -; CHECK-NEXT: .entry_point: _amdgpu_gs_main +; CHECK-NEXT: .entry_point{{(_symbol)?}}: _amdgpu_gs_main ; CHECK-NEXT: .float_mode: 0xc0 ; CHECK-NEXT: .ieee_mode: false ; CHECK-NEXT: .image_op: false @@ -345,7 +345,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .ps: ; CHECK-NEXT: .checksum_value: 0x2cbaf88c ; CHECK-NEXT: .debug_mode: false -; CHECK-NEXT: .entry_point: _amdgpu_ps_main +; CHECK-NEXT: .entry_point{{(_symbol)?}}: _amdgpu_ps_main ; CHECK-NEXT: .float_mode: 0xc0 ; CHECK-NEXT: .ieee_mode: false ; CHECK-NEXT: .image_op: true diff --git a/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe b/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe index 1242f626e8..46aa926a17 100644 --- a/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe +++ b/llpc/test/shaderdb/gfx11/TessFactorStoreWithOpt.pipe @@ -5,7 +5,7 @@ ; SHADERTEST-LABEL: .distribHsPatchCount: ; SHADERTEST-NEXT: %[[HS_PATCH_COUNT_SHIFT:[^ 
,]*]] = lshr i32 %mergeWaveInfo, 16 ; SHADERTEST-NEXT: %[[HS_PATCH_COUNT:[^ ,]*]] = and i32 %[[HS_PATCH_COUNT_SHIFT]], 255 -; SHADERTEST-NEXT: store i32 %[[HS_PATCH_COUNT]], ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @Lds.HS, i32 1536), align 4 +; SHADERTEST-NEXT: store i32 %[[HS_PATCH_COUNT]], ptr addrspace(3) @Lds.HS, align 4 ; SHADERTEST-NEXT: br label %.endDistribHsPatchCount ; SHADERTEST-LABEL: .endDistribHsPatchCount: @@ -16,47 +16,41 @@ ; SHADERTEST-NEXT: br i1 %validHsVert, label %.beginHs, label %.endHs ; SHADERTEST-LABEL: .endHs: -; SHADERTEST: %[[HS_PATCH_COUNT:[^ ,]*]] = load i32, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @Lds.HS, i32 1536), align 4 +; SHADERTEST: %[[HS_PATCH_COUNT:[^ ,]*]] = load i32, ptr addrspace(3) @Lds.HS, align 4 ; SHADERTEST: %hsPatchCount = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %[[HS_PATCH_COUNT]]) ; SHADERTEST: %validHsPatch = icmp ult i32 %threadIdInGroup, %hsPatchCount ; SHADERTEST: br i1 %validHsPatch, label %.checkSpecialTfInWave, label %.endCheckSpecialTfInWave ; SHADERTEST-LABEL: .checkSpecialTfInWave: -; SHADERTEST-NEXT: %[[OUTER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 24 +; SHADERTEST-NEXT: %[[OUTER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 28 ; SHADERTEST-NEXT: %[[OUTER_TF_I_PTR:[^ ,]*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 %[[OUTER_TF_OFFSET_0]] -; SHADERTEST-NEXT: %[[OUTER_TF:[^ ,]*]] = load <4 x float>, ptr addrspace(3) %[[OUTER_TF_I_PTR]], align 4 -; SHADERTEST-NEXT: %[[INNER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 24 +; SHADERTEST-NEXT: %[[OUTER_TF_PTR:[^ ,]*]] = getelementptr i8, ptr addrspace(3) %[[OUTER_TF_I_PTR]], i32 1316 +; SHADERTEST-NEXT: %[[OUTER_TF:[^ ,]*]] = load <4 x float>, ptr addrspace(3) %[[OUTER_TF_PTR]], align 4 +; SHADERTEST-NEXT: %[[INNER_TF_OFFSET_0:[^ ,]*]] = mul i32 %threadIdInGroup, 28 ; SHADERTEST-NEXT: %[[INNER_TF_I_PTR:[^ ,]*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 %[[INNER_TF_OFFSET_0]] -; 
SHADERTEST-NEXT: %[[INNER_TF_PTR:[^ ,]*]] = getelementptr {{(i8|i32)}}, ptr addrspace(3) %[[INNER_TF_I_PTR]], i32 {{(16|4)}} +; SHADERTEST-NEXT: %[[INNER_TF_PTR:[^ ,]*]] = getelementptr i8, ptr addrspace(3) %[[INNER_TF_I_PTR]], i32 1332 ; SHADERTEST-NEXT: %[[INNER_TF:[^ ,]*]] = load <2 x float>, ptr addrspace(3) %[[INNER_TF_PTR]], align 4 ; SHADERTEST-NEXT: %[[OUTER_TF_0:[^ ,]*]] = extractelement <4 x float> %[[OUTER_TF]], i64 0 -; SHADERTEST-NEXT: %[[IS_ONE_0:[^ ,]*]] = fcmp oeq float %[[OUTER_TF_0]], 1.000000e+00 -; SHADERTEST-NEXT: %[[IS_ZERO_0:[^ ,]*]] = fcmp oeq float %[[OUTER_TF_0]], 0.000000e+00 ; SHADERTEST-NEXT: %[[OUTER_TF_1:[^ ,]*]] = extractelement <4 x float> %[[OUTER_TF]], i64 1 -; SHADERTEST-NEXT: %[[IS_ONE_1:[^ ,]*]] = fcmp oeq float %[[OUTER_TF_1]], 1.000000e+00 -; SHADERTEST-NEXT: %[[IS_ZERO_1:[^ ,]*]] = fcmp oeq float %[[OUTER_TF_1]], 0.000000e+00 -; SHADERTEST-NEXT: %[[ALL_ONES_0:[^ ,]*]] = and i1 %[[IS_ONE_0]], %[[IS_ONE_1]] -; SHADERTEST-NEXT: %[[ALL_ZEROS_0:[^ ,]*]] = and i1 %[[IS_ZERO_0]], %[[IS_ZERO_1]] +; SHADERTEST-NEXT: %[[MIN_TF_0:[^ ,]*]] = call nnan float @llvm.minimum.f32(float %[[OUTER_TF_0]], float %[[OUTER_TF_1]]) +; SHADERTEST-NEXT: %[[MAX_TF_0:[^ ,]*]] = call nnan float @llvm.maximum.f32(float %[[OUTER_TF_0]], float %[[OUTER_TF_1]]) ; SHADERTEST-NEXT: %[[OUTER_TF_2:[^ ,]*]] = extractelement <4 x float> %[[OUTER_TF]], i64 2 -; SHADERTEST-NEXT: %[[IS_ONE_2:[^ ,]*]] = fcmp oeq float %[[OUTER_TF_2]], 1.000000e+00 -; SHADERTEST-NEXT: %[[IS_ZERO_2:[^ ,]*]] = fcmp oeq float %[[OUTER_TF_2]], 0.000000e+00 -; SHADERTEST-NEXT: %[[ALL_ONES_1:[^ ,]*]] = and i1 %[[ALL_ONES_0]], %[[IS_ONE_2]] -; SHADERTEST-NEXT: %[[ALL_ZEROS_1:[^ ,]*]] = and i1 %[[ALL_ZEROS_0]], %[[IS_ZERO_2]] +; SHADERTEST-NEXT: %[[MIN_TF_1:[^ ,]*]] = call nnan float @llvm.minimum.f32(float %[[MIN_TF_0]], float %[[OUTER_TF_2]]) +; SHADERTEST-NEXT: %[[MAX_TF_1:[^ ,]*]] = call nnan float @llvm.maximum.f32(float %[[MAX_TF_0]], float %[[OUTER_TF_2]]) ; SHADERTEST-NEXT: 
%[[OUTER_TF_3:[^ ,]*]] = extractelement <4 x float> %[[OUTER_TF]], i64 3 -; SHADERTEST-NEXT: %[[IS_ONE_3:[^ ,]*]] = fcmp oeq float %[[OUTER_TF_3]], 1.000000e+00 -; SHADERTEST-NEXT: %[[IS_ZERO_3:[^ ,]*]] = fcmp oeq float %[[OUTER_TF_3]], 0.000000e+00 -; SHADERTEST-NEXT: %[[ALL_ONES_2:[^ ,]*]] = and i1 %[[ALL_ONES_1]], %[[IS_ONE_3]] -; SHADERTEST-NEXT: %[[ALL_ZEROS_2:[^ ,]*]] = and i1 %[[ALL_ZEROS_1]], %[[IS_ZERO_3]] +; SHADERTEST-NEXT: %[[MIN_TF_2:[^ ,]*]] = call nnan float @llvm.minimum.f32(float %[[MIN_TF_1]], float %[[OUTER_TF_3]]) +; SHADERTEST-NEXT: %[[MAX_TF_2:[^ ,]*]] = call nnan float @llvm.maximum.f32(float %[[MAX_TF_1]], float %[[OUTER_TF_3]]) ; SHADERTEST-NEXT: %[[INNER_TF_0:[^ ,]*]] = extractelement <2 x float> %[[INNER_TF]], i64 0 -; SHADERTEST-NEXT: %[[IS_ONE_4:[^ ,]*]] = fcmp oeq float %[[INNER_TF_0]], 1.000000e+00 -; SHADERTEST-NEXT: %[[IS_ZERO_4:[^ ,]*]] = fcmp oeq float %[[INNER_TF_0]], 0.000000e+00 -; SHADERTEST-NEXT: %[[ALL_ONES_3:[^ ,]*]] = and i1 %[[ALL_ONES_2]], %[[IS_ONE_4]] -; SHADERTEST-NEXT: %[[ALL_ZEROS_3:[^ ,]*]] = and i1 %[[ALL_ZEROS_2]], %[[IS_ZERO_4]] +; SHADERTEST-NEXT: %[[MIN_TF_3:[^ ,]*]] = call nnan float @llvm.minimum.f32(float %[[MIN_TF_2]], float %[[INNER_TF_0]]) +; SHADERTEST-NEXT: %[[MAX_TF_3:[^ ,]*]] = call nnan float @llvm.maximum.f32(float %[[MAX_TF_2]], float %[[INNER_TF_0]]) ; SHADERTEST-NEXT: %[[INNER_TF_1:[^ ,]*]] = extractelement <2 x float> %[[INNER_TF]], i64 1 -; SHADERTEST-NEXT: %[[IS_ONE_5:[^ ,]*]] = fcmp oeq float %[[INNER_TF_1]], 1.000000e+00 -; SHADERTEST-NEXT: %[[IS_ZERO_5:[^ ,]*]] = fcmp oeq float %[[INNER_TF_1]], 0.000000e+00 -; SHADERTEST-NEXT: %[[ALL_ONES:[^ ,]*]] = and i1 %[[ALL_ONES_3]], %[[IS_ONE_5]] -; SHADERTEST-NEXT: %[[ALL_ZEROS:[^ ,]*]] = and i1 %[[ALL_ZEROS_3]], %[[IS_ZERO_5]] +; SHADERTEST-NEXT: %[[MIN_TF:[^ ,]*]] = call nnan float @llvm.minimum.f32(float %[[MIN_TF_3]], float %[[INNER_TF_1]]) +; SHADERTEST-NEXT: %[[MAX_TF:[^ ,]*]] = call nnan float @llvm.maximum.f32(float %[[MAX_TF_3]], float 
%[[INNER_TF_1]]) +; SHADERTEST-NEXT: %[[MIN_TF_EQ_MAX_TF:[^ ,]*]] = fcmp nnan oeq float %[[MIN_TF]], %[[MAX_TF]] +; SHADERTEST-NEXT: %[[IS_ONE:[^ ,]*]] = fcmp nnan oeq float %[[MIN_TF]], 1.000000e+00 +; SHADERTEST-NEXT: %[[IS_ZERO:[^ ,]*]] = fcmp nnan oeq float %[[MIN_TF]], 0.000000e+00 +; SHADERTEST-NEXT: %[[ALL_ONES:[^ ,]*]] = and i1 %[[MIN_TF_EQ_MAX_TF]], %[[IS_ONE]] +; SHADERTEST-NEXT: %[[ALL_ZEROS:[^ ,]*]] = and i1 %[[MIN_TF_EQ_MAX_TF]], %[[IS_ZERO]] ; SHADERTEST-NEXT: %[[BALLOT_MASK:[^ ,]*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; SHADERTEST-NEXT: %[[ALL_ONES_MASK:[^ ,]*]] = call i64 @llvm.amdgcn.ballot.i64(i1 %[[ALL_ONES]]) ; SHADERTEST-NEXT: %[[ALL_ONES_IN_WAVE:[^ ,]*]] = icmp eq i64 %[[ALL_ONES_MASK]], %[[BALLOT_MASK]] @@ -65,8 +59,8 @@ ; SHADERTEST-NEXT: br label %.endCheckSpecialTfInWave ; SHADERTEST-LABEL: .endCheckSpecialTfInWave: -; SHADERTEST-NEXT: %outerTf = phi <4 x float> [ poison, %.endHs ], [ %[[OUTER_TF]], %.checkSpecialTfInWave ] -; SHADERTEST-NEXT: %innerTf = phi <2 x float> [ poison, %.endHs ], [ %[[INNER_TF]], %.checkSpecialTfInWave ] +; SHADERTEST-NEXT: %outerTf = phi nnan <4 x float> [ poison, %.endHs ], [ %[[OUTER_TF]], %.checkSpecialTfInWave ] +; SHADERTEST-NEXT: %innerTf = phi nnan <2 x float> [ poison, %.endHs ], [ %[[INNER_TF]], %.checkSpecialTfInWave ] ; SHADERTEST-NEXT: %isAllOnesTfInWave = phi i1 [ true, %.endHs ], [ %[[ALL_ONES_IN_WAVE]], %.checkSpecialTfInWave ] ; SHADERTEST-NEXT: %isAllZerosTfInWave = phi i1 [ true, %.endHs ], [ %[[ALL_ZEROS_IN_WAVE]], %.checkSpecialTfInWave ] ; SHADERTEST-NEXT: %[[HS_PATCH_COUNT_ADJUST:[^ ,]*]] = add i32 %hsPatchCount, 63 @@ -76,13 +70,13 @@ ; SHADERTEST-LABEL: .handleMultiWave: ; SHADERTEST-NEXT: %hsPatchWaveCount = lshr i32 %[[HS_PATCH_COUNT_ADJUST]], 6 ; SHADERTEST-NEXT: %[[WAVE_ID_OFFSET:[^ ,]*]] = shl nuw nsw i32 %waveIdInGroup, 1 -; SHADERTEST-NEXT: %[[ALL_ONES_OFFSET:[^ ,]*]] = or {{.*}}i32 %[[WAVE_ID_OFFSET]], 385 +; SHADERTEST-NEXT: %[[ALL_ONES_OFFSET:[^ ,]*]] = or {{.*}}i32 
%[[WAVE_ID_OFFSET]], 1 ; SHADERTEST-NEXT: %[[IS_ALL_ONES_TF:[^ ,]*]] = zext i1 %isAllOnesTfInWave to i32 ; SHADERTEST-NEXT: %[[ALL_ONES_PTR:[^ ,]*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 %[[ALL_ONES_OFFSET]] ; SHADERTEST-NEXT: store i32 %[[IS_ALL_ONES_TF]], ptr addrspace(3) %[[ALL_ONES_PTR]], align 4 ; SHADERTEST-NEXT: %[[IS_ALL_ZEROS_TF:[^ ,]*]] = zext i1 %isAllZerosTfInWave to i32 ; SHADERTEST-NEXT: %[[ALL_ZEROS_I_PTR:[^ ,]*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 %[[WAVE_ID_OFFSET]] -; SHADERTEST-NEXT: %[[ALL_ZEROS_PTR:[^ ,]*]] = getelementptr {{(i8|i32)}}, ptr addrspace(3) %[[ALL_ZEROS_I_PTR]], i32 {{(1544|386)}} +; SHADERTEST-NEXT: %[[ALL_ZEROS_PTR:[^ ,]*]] = getelementptr i8, ptr addrspace(3) %[[ALL_ZEROS_I_PTR]], i32 8 ; SHADERTEST-NEXT: store i32 %[[IS_ALL_ZEROS_TF]], ptr addrspace(3) %[[ALL_ZEROS_PTR]], align 4 ; SHADERTEST-NEXT: fence syncscope("workgroup") release ; SHADERTEST-NEXT: call void @llvm.amdgcn.s.barrier() @@ -92,12 +86,12 @@ ; SHADERTEST-LABEL: .checkSpecialTfInGroup: ; SHADERTEST-NEXT: %[[THREAD_ID_OFFSET:[^ ,]*]] = shl i32 %threadIdInWave, 1 -; SHADERTEST-NEXT: %[[ALL_ONES_I_PTR:[^ ,]*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 %[[THREAD_ID_OFFSET]] -; SHADERTEST-NEXT: %[[ALL_ONES_PTR:[^ ,]*]] = getelementptr {{(i8|i32)}}, ptr addrspace(3) %[[ALL_ONES_I_PTR]], i32 {{(1540|385)}} +; SHADERTEST-NEXT: %[[ALL_ONES_OFFSET:[^ ,]*]] = or disjoint i32 %[[THREAD_ID_OFFSET]], 1 +; SHADERTEST-NEXT: %[[ALL_ONES_PTR:[^ ,]*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 %[[ALL_ONES_OFFSET]] ; SHADERTEST-NEXT: %[[IS_ALL_ONES_TF:[^ ,]*]] = load i32, ptr addrspace(3) %[[ALL_ONES_PTR]], align 4 ; SHADERTEST-NEXT: %[[ALL_ONES_VALUE:[^ ,]*]] = trunc i32 %[[IS_ALL_ONES_TF]] to i1 ; SHADERTEST-NEXT: %[[ALL_ZEROS_I_PTR:[^ ,]*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 %[[THREAD_ID_OFFSET]] -; SHADERTEST-NEXT: %[[ALL_ZEROS_PTR:[^ ,]*]] = getelementptr {{(i8|i32)}}, ptr addrspace(3) %[[ALL_ZEROS_I_PTR]], i32 
{{(1544|386)}} +; SHADERTEST-NEXT: %[[ALL_ZEROS_PTR:[^ ,]*]] = getelementptr i8, ptr addrspace(3) %[[ALL_ZEROS_I_PTR]], i32 8 ; SHADERTEST-NEXT: %[[IS_ALL_ZEROS_TF:[^ ,]*]] = load i32, ptr addrspace(3) %[[ALL_ZEROS_PTR]], align 4 ; SHADERTEST-NEXT: %[[ALL_ZERO_VALUE:[^ ,]*]] = trunc i32 %[[IS_ALL_ZEROS_TF]] to i1 ; SHADERTEST-NEXT: %[[BALLOT_MASK:[^ ,]*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) @@ -110,7 +104,7 @@ ; SHADERTEST-LABEL: .endHandleMultiWave: ; SHADERTEST-NEXT: %isAllOnesTf = phi i1 [ %isAllOnesTfInWave, %.endCheckSpecialTfInWave ], [ true, %.handleMultiWave ], [ %[[ALL_ONES_IN_GROUP]], %.checkSpecialTfInGroup ] ; SHADERTEST-NEXT: %isAllZerosTf = phi i1 [ %isAllZerosTfInWave, %.endCheckSpecialTfInWave ], [ true, %.handleMultiWave ], [ %[[ALL_ZEROS_IN_GROUP]], %.checkSpecialTfInGroup ] -; SHADERTEST-NEXT: br i1 %validHsPatch, label %.tryStoreTf, label %.endTryStoreTf +; SHADERTEST-NEXT: br i1 %validHsPatch, label %.tryStoreTf, label %.endTryStoreHsOutputs ; SHADERTEST-LABEL: .tryStoreTf: ; SHADERTEST-NEXT: %isSpecialTf = or i1 %isAllOnesTf, %isAllZerosTf @@ -118,23 +112,23 @@ ; SHADERTEST-LABEL: .checkSendTfMessage: ; SHADERTEST-NEXT: %[[FIRST_WAVE:[^ ,]*]] = icmp eq i32 %waveIdInGroup, 0 -; SHADERTEST-NEXT: br i1 %[[FIRST_WAVE]], label %.sendTfMessage, label %.endTryStoreTf +; SHADERTEST-NEXT: br i1 %[[FIRST_WAVE]], label %.sendTfMessage, label %.endTryStoreHsOutputs ; SHADERTEST-LABEL: .sendTfMessage: ; SHADERTEST-NEXT: %[[IS_ALL_ONES_TF:[^ ,]*]] = zext i1 %isAllOnesTf to i32 ; SHADERTEST-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 2, i32 %[[IS_ALL_ONES_TF]]) -; SHADERTEST-NEXT: br label %.endTryStoreTf +; SHADERTEST-NEXT: br label %.endTryStoreHsOutputs ; SHADERTEST-LABEL: .storeTf: -; SHADERTEST: %tfBufferDescPtr = getelementptr {{i8|<4 x i32>}}, ptr addrspace(4) %globalTablePtr, i64 {{144|9}} +; SHADERTEST: %tfBufferDescPtr = getelementptr i8, ptr addrspace(4) %globalTablePtr, i64 144 ; SHADERTEST-NEXT: %tfBufferDesc = load <4 x i32>, ptr 
addrspace(4) %tfBufferDescPtr, align 16 ; SHADERTEST-NEXT: %[[OUTER_TF_OFFSET:[^ ,]*]] = mul i32 %threadIdInGroup, 24 -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.v4f32{{(\.v4i32)?}}(<4 x float> %outerTf, <4 x i32> %tfBufferDesc, i32 %[[OUTER_TF_OFFSET]], i32 %tfBufferBase, i32 63, i32 1) +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.buffer.store.v4f32{{(.v4i32)?}}(<4 x float> %outerTf, <4 x i32> %tfBufferDesc, i32 %[[OUTER_TF_OFFSET]], i32 %tfBufferBase, i32 1) ; SHADERTEST-NEXT: %[[INNER_TF_OFFSET:[^ ,]*]] = add i32 %[[OUTER_TF_OFFSET]], 16 -; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.tbuffer.store.v2f32{{(\.v4i32)?}}(<2 x float> %innerTf, <4 x i32> %tfBufferDesc, i32 %[[INNER_TF_OFFSET]], i32 %tfBufferBase, i32 50, i32 1) -; SHADERTEST-NEXT: br label %.endTryStoreTf +; SHADERTEST-NEXT: call void @llvm.amdgcn.raw.buffer.store.v2f32{{(.v4i32)?}}(<2 x float> %innerTf, <4 x i32> %tfBufferDesc, i32 %[[INNER_TF_OFFSET]], i32 %tfBufferBase, i32 1) +; SHADERTEST-NEXT: br label %.endTryStoreHsOutputs -; SHADERTEST-LABEL: .endTryStoreTf: +; SHADERTEST-LABEL: .endTryStoreHsOutputs: ; SHADERTEST-NEXT: ret void [Version] diff --git a/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe b/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe index 04025d52ee..9a46f7cdd2 100644 --- a/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe +++ b/llpc/test/shaderdb/gfx11/TestGsXfbWithHole.pipe @@ -5,11 +5,11 @@ ; RUN: amdllpc -gfxip=11 -stop-after=lgc-generate-copy-shader -v %s | FileCheck -check-prefix=CHECK %s ; CHECK-LABEL: @lgc.shader.COPY.main( -; CHECK: [[TMP1:%.*]] = call float @lgc.ngg.read.GS.output.f32(i32 0, i32 0, i32 0) -; CHECK: [[TMP2:%.*]] = call float @lgc.ngg.read.GS.output.f32(i32 0, i32 2, i32 0) -; CHECK: [[TMP3:%.*]] = call float @lgc.ngg.read.GS.output.f32(i32 0, i32 1, i32 0) -; CHECK: call void @lgc.output.export.xfb.i32.i32.i32.f32(i32 0, i32 0, i32 0, float [[TMP1]]) -; CHECK: call void @lgc.output.export.xfb.i32.i32.i32.f32(i32 0, i32 8, i32 0, float 
[[TMP2]]) +; CHECK: [[TMP1:%.*]] = call float @lgc.ngg.read.gs.output__f32(i32 0, i32 0, i32 0) +; CHECK: [[TMP2:%.*]] = call float @lgc.ngg.read.gs.output__f32(i32 0, i32 2, i32 0) +; CHECK: [[TMP3:%.*]] = call float @lgc.ngg.read.gs.output__f32(i32 0, i32 1, i32 0) +; CHECK: call void (...) @lgc.write.xfb.output(i32 0, i32 0, i32 0, float [[TMP1]]) +; CHECK: call void (...) @lgc.write.xfb.output(i32 0, i32 8, i32 0, float [[TMP2]]) ; CHECK: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0 ; CHECK: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i64 2 ; CHECK: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP3]], i64 1 diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp b/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp index a8ae335582..e85d669f8e 100644 --- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp +++ b/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp @@ -35,9 +35,9 @@ void main() { // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) (...) @lgc.create.load.push.constants.ptr.p4() // CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) // CHECK-NEXT: [[TMP2:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP1]], i32 16, i32 0) -// CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16) +// CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16, i32 16) // CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP1]], i32 16, i32 32) -// CHECK-NEXT: [[LOAD2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP3]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16) +// CHECK-NEXT: [[LOAD2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP3]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16, i32 16) // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 2 // CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP11:%.*]] @@ -57,6 +57,6 @@ void main() { // CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], <8 x float> [[LOAD2]], <8 x float> [[LOAD]] // CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP15]], <8 x float> [[TMP17]], <8 x float> zeroinitializer // CHECK-NEXT: [[TMP19:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP1]], i32 16, i32 64) -// CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP19]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP18]]) +// CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP19]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP18]], i32 16) // CHECK-NEXT: ret void // diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm b/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm index bd9bae7354..903928baf1 100644 --- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm +++ b/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm @@ -124,14 +124,14 @@ ; CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 1, i32 0, i32 2) ; CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) ; CHECK-NEXT: [[TMP2:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP1]], i32 16, i32 0) -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16, i32 16) ; CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP0]], i32 16, i32 0) -; CHECK-NEXT: [[LOAD1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP3]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16) +; CHECK-NEXT: [[LOAD1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP3]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16, i32 16) ; CHECK-NEXT: br label [[TMP4:%.*]] ; CHECK: 4: ; CHECK-NEXT: [[DOT012:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[TMP12:%.*]], [[TMP7:%.*]] ] ; CHECK-NEXT: [[DOT0:%.*]] = phi <8 x float> [ undef, [[DOTENTRY]] ], [ [[TMP11:%.*]], [[TMP7]] ] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @lgc.cooperative.matrix.length(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @lgc.cooperative.matrix.length(i32 0, i32 16) ; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i32 [[DOT012]], [[TMP5]] ; CHECK-NEXT: [[COND_FREEZE:%.*]] = freeze i1 [[TMP6]] ; CHECK-NEXT: br i1 [[COND_FREEZE]], label [[TMP7]], label [[TMP13:%.*]] @@ -141,10 +141,10 @@ ; CHECK-NEXT: [[TMP10:%.*]] = fmul reassoc nnan nsz arcp contract afn half [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[TMP11]] = call <8 x float> (...) 
@lgc.cooperative.matrix.insert__v8f32(<8 x float> [[DOT0]], half [[TMP10]], i32 [[DOT012]], i32 1, i32 0) ; CHECK-NEXT: [[TMP12]] = add i32 [[DOT012]], 1 -; CHECK-NEXT: br label [[TMP4]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br label [[TMP4]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: 13: ; CHECK-NEXT: [[TMP14:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 2, i32 0, i32 2) ; CHECK-NEXT: [[TMP15:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP14]], i32 16, i32 0) -; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP15]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[DOT0]]) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP15]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[DOT0]], i32 16) ; CHECK-NEXT: ret void ; diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp b/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp index a41a7fee79..11f1aebfb5 100644 --- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp +++ b/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp @@ -25,8 +25,8 @@ void main() { // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 1, i32 0, i32 2) // CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) // CHECK-NEXT: [[TMP2:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP1]], i32 16, i32 0) -// CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16) +// CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, i32 16) // CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP0]], i32 16, i32 0) -// CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP3]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[LOAD]]) +// CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP3]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[LOAD]], i32 16) // CHECK-NEXT: ret void // diff --git a/llpc/test/shaderdb/object/ObjInput_TestIndexingInterpOfInputArray_lit.frag b/llpc/test/shaderdb/object/ObjInput_TestIndexingInterpOfInputArray_lit.frag index dd20457bd0..929e241c0b 100644 --- a/llpc/test/shaderdb/object/ObjInput_TestIndexingInterpOfInputArray_lit.frag +++ b/llpc/test/shaderdb/object/ObjInput_TestIndexingInterpOfInputArray_lit.frag @@ -1,5 +1,7 @@ -#version 450 core +// NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --function _amdgpu_ps_main +// RUN: amdllpc --print-after=lgc-set-up-target-features %s 2>&1 | FileCheck -check-prefix=SHADERTEST %s +#version 450 core #define ITER 5 layout(set=0, binding=0) uniform UniformBuffer { @@ -30,62 +32,245 @@ void main() frag_color.z = 0.0f; frag_color.w = 1.0f; } - - -// BEGIN_SHADERTEST -/* -; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> @lgc.input.import.builtin.InterpPullMode.v3f32.i32(i32 268435459) -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 245, i32 15, i32 15, i1 true) -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 160, i32 15, i32 15, i1 true) -; SHADERTEST: 
call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 245, i32 15, i32 15, i1 true) -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 160, i32 15, i32 15, i1 true) -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 245, i32 15, i32 15, i1 true) -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 160, i32 15, i32 15, i1 true) -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 238, i32 15, i32 15, i1 true) -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 68, i32 15, i32 15, i1 true) -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 238, i32 15, i32 15, i1 true) -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 68, i32 15, i32 15, i1 true) -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 238, i32 15, i32 15, i1 true) -; SHADERTEST: = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %{{.*}}, i32 68, i32 15, i32 15, i1 true) -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: = call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.input.import.interpolated__v4f32(i1 false, i32 4, i32 0, i32 0, i32 poison, i32 0, <2 x float> -; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call i32 @llvm.amdgcn.mov.dpp.i32 -; SHADERTEST: call {{.*}}float @llvm.amdgcn.wqm.f32 -; SHADERTEST: call float @llvm.amdgcn.interp.p1 -; SHADERTEST: call float @llvm.amdgcn.interp.p2 -; SHADERTEST: call float @llvm.amdgcn.interp.p1 -; SHADERTEST: call float @llvm.amdgcn.interp.p2 -; SHADERTEST: call float @llvm.amdgcn.interp.p1 -; SHADERTEST: call float @llvm.amdgcn.interp.p2 -; SHADERTEST: call float @llvm.amdgcn.interp.p1 -; SHADERTEST: call float @llvm.amdgcn.interp.p2 -; SHADERTEST-LABEL: {{^// LLPC}} final pipeline module info -; SHADERTEST: AMDLLPC SUCCESS -*/ -// END_SHADERTEST +// SHADERTEST-LABEL: @_amdgpu_ps_main( +// SHADERTEST-NEXT: .entry: +// SHADERTEST-NEXT: [[PERSPINTERPPULLMODE_I2:%.*]] = extractelement <3 x float> [[PERSPINTERPPULLMODE:%.*]], i64 2 +// SHADERTEST-NEXT: [[BC_I2:%.*]] = bitcast float [[PERSPINTERPPULLMODE_I2]] to i32 +// SHADERTEST-NEXT: [[PERSPINTERPPULLMODE_I1:%.*]] = extractelement <3 x float> [[PERSPINTERPPULLMODE]], i64 1 +// SHADERTEST-NEXT: [[BC_I1:%.*]] = bitcast 
float [[PERSPINTERPPULLMODE_I1]] to i32 +// SHADERTEST-NEXT: [[PERSPINTERPPULLMODE_I0:%.*]] = extractelement <3 x float> [[PERSPINTERPPULLMODE]], i64 0 +// SHADERTEST-NEXT: [[BC_I0:%.*]] = bitcast float [[PERSPINTERPPULLMODE_I0]] to i32 +// SHADERTEST-NEXT: [[PERSPINTERPCENTER_I0:%.*]] = extractelement <2 x float> [[PERSPINTERPCENTER:%.*]], i64 0 +// SHADERTEST-NEXT: [[PERSPINTERPCENTER_I1:%.*]] = extractelement <2 x float> [[PERSPINTERPCENTER]], i64 1 +// SHADERTEST-NEXT: [[TMP0:%.*]] = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, [[PERSPINTERPPULLMODE_I2]] +// SHADERTEST-NEXT: [[DOTI0:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP0]], [[PERSPINTERPPULLMODE_I0]] +// SHADERTEST-NEXT: [[DOTI1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP0]], [[PERSPINTERPPULLMODE_I1]] +// SHADERTEST-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.interp.p1(float [[DOTI0]], i32 0, i32 0, i32 [[PRIMMASK:%.*]]) +// SHADERTEST-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP1]], float [[DOTI1]], i32 0, i32 0, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.interp.p1(float [[DOTI0]], i32 1, i32 0, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP3]], float [[DOTI1]], i32 1, i32 0, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP5]] to float +// SHADERTEST-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP7]] to float +// SHADERTEST-NEXT: [[TMP9:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP6]], [[TMP8]] +// SHADERTEST-NEXT: [[TMP10:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP9]]) +// SHADERTEST-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 
[[BC_I1]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP11]] to float +// SHADERTEST-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP13]] to float +// SHADERTEST-NEXT: [[TMP15:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP12]], [[TMP14]] +// SHADERTEST-NEXT: [[TMP16:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP15]]) +// SHADERTEST-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +// SHADERTEST-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +// SHADERTEST-NEXT: [[TMP21:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP18]], [[TMP20]] +// SHADERTEST-NEXT: [[TMP22:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP21]]) +// SHADERTEST-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float +// SHADERTEST-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP25]] to float +// SHADERTEST-NEXT: [[TMP27:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP24]], [[TMP26]] +// SHADERTEST-NEXT: [[TMP28:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP27]]) +// SHADERTEST-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP30:%.*]] = bitcast i32 [[TMP29]] to float +// SHADERTEST-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 
[[BC_I1]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP32:%.*]] = bitcast i32 [[TMP31]] to float +// SHADERTEST-NEXT: [[TMP33:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP30]], [[TMP32]] +// SHADERTEST-NEXT: [[TMP34:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP33]]) +// SHADERTEST-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP36:%.*]] = bitcast i32 [[TMP35]] to float +// SHADERTEST-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +// SHADERTEST-NEXT: [[TMP39:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP36]], [[TMP38]] +// SHADERTEST-NEXT: [[TMP40:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP39]]) +// SHADERTEST-NEXT: [[REASS_ADD_1_I0:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP28]], [[TMP10]] +// SHADERTEST-NEXT: [[REASS_ADD_1_I1:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP34]], [[TMP16]] +// SHADERTEST-NEXT: [[REASS_ADD_1_I2:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP40]], [[TMP22]] +// SHADERTEST-NEXT: [[REASS_MUL_1_I0:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_1_I0]], 0x3FC99999A0000000 +// SHADERTEST-NEXT: [[REASS_MUL_1_I1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_1_I1]], 0x3FC99999A0000000 +// SHADERTEST-NEXT: [[REASS_MUL_1_I2:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_1_I2]], 0x3FC99999A0000000 +// SHADERTEST-NEXT: [[DOTI060:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_1_I0]], [[PERSPINTERPPULLMODE_I0]] +// SHADERTEST-NEXT: [[DOTI161:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_1_I1]], [[PERSPINTERPPULLMODE_I1]] +// SHADERTEST-NEXT: [[DOTI2:%.*]] = fadd reassoc nnan nsz arcp 
contract afn float [[REASS_MUL_1_I2]], [[PERSPINTERPPULLMODE_I2]] +// SHADERTEST-NEXT: [[TMP41:%.*]] = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, [[DOTI2]] +// SHADERTEST-NEXT: [[DOTI062:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP41]], [[DOTI060]] +// SHADERTEST-NEXT: [[DOTI163:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP41]], [[DOTI161]] +// SHADERTEST-NEXT: [[TMP42:%.*]] = call float @llvm.amdgcn.interp.p1(float [[DOTI062]], i32 0, i32 1, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP43:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP42]], float [[DOTI163]], i32 0, i32 1, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.interp.p1(float [[DOTI062]], i32 1, i32 1, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP45:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP44]], float [[DOTI163]], i32 1, i32 1, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[DOTI064:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP43]], [[TMP2]] +// SHADERTEST-NEXT: [[DOTI165:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP45]], [[TMP4]] +// SHADERTEST-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP47:%.*]] = bitcast i32 [[TMP46]] to float +// SHADERTEST-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP49:%.*]] = bitcast i32 [[TMP48]] to float +// SHADERTEST-NEXT: [[TMP50:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP47]], [[TMP49]] +// SHADERTEST-NEXT: [[TMP51:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP50]]) +// SHADERTEST-NEXT: [[TMP52:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP53:%.*]] = bitcast i32 [[TMP52]] to float +// SHADERTEST-NEXT: [[TMP54:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 
160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP55:%.*]] = bitcast i32 [[TMP54]] to float +// SHADERTEST-NEXT: [[TMP56:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP53]], [[TMP55]] +// SHADERTEST-NEXT: [[TMP57:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP56]]) +// SHADERTEST-NEXT: [[TMP58:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP59:%.*]] = bitcast i32 [[TMP58]] to float +// SHADERTEST-NEXT: [[TMP60:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP61:%.*]] = bitcast i32 [[TMP60]] to float +// SHADERTEST-NEXT: [[TMP62:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP59]], [[TMP61]] +// SHADERTEST-NEXT: [[TMP63:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP62]]) +// SHADERTEST-NEXT: [[TMP64:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP65:%.*]] = bitcast i32 [[TMP64]] to float +// SHADERTEST-NEXT: [[TMP66:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP67:%.*]] = bitcast i32 [[TMP66]] to float +// SHADERTEST-NEXT: [[TMP68:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP65]], [[TMP67]] +// SHADERTEST-NEXT: [[TMP69:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP68]]) +// SHADERTEST-NEXT: [[TMP70:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP71:%.*]] = bitcast i32 [[TMP70]] to float +// SHADERTEST-NEXT: [[TMP72:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP73:%.*]] = bitcast i32 [[TMP72]] to float +// SHADERTEST-NEXT: [[TMP74:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP71]], 
[[TMP73]] +// SHADERTEST-NEXT: [[TMP75:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP74]]) +// SHADERTEST-NEXT: [[TMP76:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP77:%.*]] = bitcast i32 [[TMP76]] to float +// SHADERTEST-NEXT: [[TMP78:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP79:%.*]] = bitcast i32 [[TMP78]] to float +// SHADERTEST-NEXT: [[TMP80:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP77]], [[TMP79]] +// SHADERTEST-NEXT: [[TMP81:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP80]]) +// SHADERTEST-NEXT: [[REASS_ADD_2_I0:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP69]], [[TMP51]] +// SHADERTEST-NEXT: [[REASS_ADD_2_I1:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP75]], [[TMP57]] +// SHADERTEST-NEXT: [[REASS_ADD_2_I2:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP81]], [[TMP63]] +// SHADERTEST-NEXT: [[REASS_MUL_2_I0:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_2_I0]], 0x3FD99999A0000000 +// SHADERTEST-NEXT: [[REASS_MUL_2_I1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_2_I1]], 0x3FD99999A0000000 +// SHADERTEST-NEXT: [[REASS_MUL_2_I2:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_2_I2]], 0x3FD99999A0000000 +// SHADERTEST-NEXT: [[DOTI067:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_2_I0]], [[PERSPINTERPPULLMODE_I0]] +// SHADERTEST-NEXT: [[DOTI168:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_2_I1]], [[PERSPINTERPPULLMODE_I1]] +// SHADERTEST-NEXT: [[DOTI269:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_2_I2]], [[PERSPINTERPPULLMODE_I2]] +// SHADERTEST-NEXT: [[TMP82:%.*]] = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, [[DOTI269]] +// SHADERTEST-NEXT: 
[[DOTI070:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP82]], [[DOTI067]] +// SHADERTEST-NEXT: [[DOTI171:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP82]], [[DOTI168]] +// SHADERTEST-NEXT: [[TMP83:%.*]] = call float @llvm.amdgcn.interp.p1(float [[DOTI070]], i32 0, i32 2, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP84:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP83]], float [[DOTI171]], i32 0, i32 2, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP85:%.*]] = call float @llvm.amdgcn.interp.p1(float [[DOTI070]], i32 1, i32 2, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP86:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP85]], float [[DOTI171]], i32 1, i32 2, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[DOTI072:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP84]], [[DOTI064]] +// SHADERTEST-NEXT: [[DOTI173:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP86]], [[DOTI165]] +// SHADERTEST-NEXT: [[TMP87:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP88:%.*]] = bitcast i32 [[TMP87]] to float +// SHADERTEST-NEXT: [[TMP89:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP90:%.*]] = bitcast i32 [[TMP89]] to float +// SHADERTEST-NEXT: [[TMP91:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP88]], [[TMP90]] +// SHADERTEST-NEXT: [[TMP92:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP91]]) +// SHADERTEST-NEXT: [[TMP93:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP94:%.*]] = bitcast i32 [[TMP93]] to float +// SHADERTEST-NEXT: [[TMP95:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP96:%.*]] = bitcast i32 [[TMP95]] to float +// SHADERTEST-NEXT: [[TMP97:%.*]] = fsub reassoc nnan nsz arcp contract afn float 
[[TMP94]], [[TMP96]] +// SHADERTEST-NEXT: [[TMP98:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP97]]) +// SHADERTEST-NEXT: [[TMP99:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP100:%.*]] = bitcast i32 [[TMP99]] to float +// SHADERTEST-NEXT: [[TMP101:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP102:%.*]] = bitcast i32 [[TMP101]] to float +// SHADERTEST-NEXT: [[TMP103:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP100]], [[TMP102]] +// SHADERTEST-NEXT: [[TMP104:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP103]]) +// SHADERTEST-NEXT: [[TMP105:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP106:%.*]] = bitcast i32 [[TMP105]] to float +// SHADERTEST-NEXT: [[TMP107:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP108:%.*]] = bitcast i32 [[TMP107]] to float +// SHADERTEST-NEXT: [[TMP109:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP106]], [[TMP108]] +// SHADERTEST-NEXT: [[TMP110:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP109]]) +// SHADERTEST-NEXT: [[TMP111:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP112:%.*]] = bitcast i32 [[TMP111]] to float +// SHADERTEST-NEXT: [[TMP113:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP114:%.*]] = bitcast i32 [[TMP113]] to float +// SHADERTEST-NEXT: [[TMP115:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP112]], [[TMP114]] +// SHADERTEST-NEXT: [[TMP116:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP115]]) +// SHADERTEST-NEXT: 
[[TMP117:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP118:%.*]] = bitcast i32 [[TMP117]] to float +// SHADERTEST-NEXT: [[TMP119:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP120:%.*]] = bitcast i32 [[TMP119]] to float +// SHADERTEST-NEXT: [[TMP121:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP118]], [[TMP120]] +// SHADERTEST-NEXT: [[TMP122:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP121]]) +// SHADERTEST-NEXT: [[REASS_ADD_3_I0:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP110]], [[TMP92]] +// SHADERTEST-NEXT: [[REASS_ADD_3_I1:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP116]], [[TMP98]] +// SHADERTEST-NEXT: [[REASS_ADD_3_I2:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP122]], [[TMP104]] +// SHADERTEST-NEXT: [[REASS_MUL_3_I0:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_3_I0]], 0x3FE3333340000000 +// SHADERTEST-NEXT: [[REASS_MUL_3_I1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_3_I1]], 0x3FE3333340000000 +// SHADERTEST-NEXT: [[REASS_MUL_3_I2:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_3_I2]], 0x3FE3333340000000 +// SHADERTEST-NEXT: [[DOTI076:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_3_I0]], [[PERSPINTERPPULLMODE_I0]] +// SHADERTEST-NEXT: [[DOTI177:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_3_I1]], [[PERSPINTERPPULLMODE_I1]] +// SHADERTEST-NEXT: [[DOTI278:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_3_I2]], [[PERSPINTERPPULLMODE_I2]] +// SHADERTEST-NEXT: [[TMP123:%.*]] = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, [[DOTI278]] +// SHADERTEST-NEXT: [[DOTI079:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP123]], [[DOTI076]] +// SHADERTEST-NEXT: [[DOTI180:%.*]] = fmul reassoc nnan 
nsz arcp contract afn float [[TMP123]], [[DOTI177]] +// SHADERTEST-NEXT: [[TMP124:%.*]] = call float @llvm.amdgcn.interp.p1(float [[DOTI079]], i32 0, i32 3, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP125:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP124]], float [[DOTI180]], i32 0, i32 3, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP126:%.*]] = call float @llvm.amdgcn.interp.p1(float [[DOTI079]], i32 1, i32 3, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP127:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP126]], float [[DOTI180]], i32 1, i32 3, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[DOTI081:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP125]], [[DOTI072]] +// SHADERTEST-NEXT: [[DOTI182:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP127]], [[DOTI173]] +// SHADERTEST-NEXT: [[TMP128:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP129:%.*]] = bitcast i32 [[TMP128]] to float +// SHADERTEST-NEXT: [[TMP130:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP131:%.*]] = bitcast i32 [[TMP130]] to float +// SHADERTEST-NEXT: [[TMP132:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP129]], [[TMP131]] +// SHADERTEST-NEXT: [[TMP133:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP132]]) +// SHADERTEST-NEXT: [[TMP134:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP135:%.*]] = bitcast i32 [[TMP134]] to float +// SHADERTEST-NEXT: [[TMP136:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP137:%.*]] = bitcast i32 [[TMP136]] to float +// SHADERTEST-NEXT: [[TMP138:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP135]], [[TMP137]] +// SHADERTEST-NEXT: [[TMP139:%.*]] = call reassoc nnan nsz arcp contract afn float 
@llvm.amdgcn.wqm.f32(float [[TMP138]]) +// SHADERTEST-NEXT: [[TMP140:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 245, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP141:%.*]] = bitcast i32 [[TMP140]] to float +// SHADERTEST-NEXT: [[TMP142:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 160, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP143:%.*]] = bitcast i32 [[TMP142]] to float +// SHADERTEST-NEXT: [[TMP144:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP141]], [[TMP143]] +// SHADERTEST-NEXT: [[TMP145:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP144]]) +// SHADERTEST-NEXT: [[TMP146:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP147:%.*]] = bitcast i32 [[TMP146]] to float +// SHADERTEST-NEXT: [[TMP148:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I0]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP149:%.*]] = bitcast i32 [[TMP148]] to float +// SHADERTEST-NEXT: [[TMP150:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP147]], [[TMP149]] +// SHADERTEST-NEXT: [[TMP151:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP150]]) +// SHADERTEST-NEXT: [[TMP152:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 238, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP153:%.*]] = bitcast i32 [[TMP152]] to float +// SHADERTEST-NEXT: [[TMP154:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I1]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP155:%.*]] = bitcast i32 [[TMP154]] to float +// SHADERTEST-NEXT: [[TMP156:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP153]], [[TMP155]] +// SHADERTEST-NEXT: [[TMP157:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP156]]) +// SHADERTEST-NEXT: [[TMP158:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 238, i32 15, i32 15, i1 true) +// 
SHADERTEST-NEXT: [[TMP159:%.*]] = bitcast i32 [[TMP158]] to float +// SHADERTEST-NEXT: [[TMP160:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[BC_I2]], i32 68, i32 15, i32 15, i1 true) +// SHADERTEST-NEXT: [[TMP161:%.*]] = bitcast i32 [[TMP160]] to float +// SHADERTEST-NEXT: [[TMP162:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[TMP159]], [[TMP161]] +// SHADERTEST-NEXT: [[TMP163:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.wqm.f32(float [[TMP162]]) +// SHADERTEST-NEXT: [[REASS_ADD_4_I0:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP151]], [[TMP133]] +// SHADERTEST-NEXT: [[REASS_ADD_4_I1:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP157]], [[TMP139]] +// SHADERTEST-NEXT: [[REASS_ADD_4_I2:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP163]], [[TMP145]] +// SHADERTEST-NEXT: [[REASS_MUL_4_I0:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_4_I0]], 0x3FE99999A0000000 +// SHADERTEST-NEXT: [[REASS_MUL_4_I1:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_4_I1]], 0x3FE99999A0000000 +// SHADERTEST-NEXT: [[REASS_MUL_4_I2:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[REASS_ADD_4_I2]], 0x3FE99999A0000000 +// SHADERTEST-NEXT: [[DOTI085:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_4_I0]], [[PERSPINTERPPULLMODE_I0]] +// SHADERTEST-NEXT: [[DOTI186:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_4_I1]], [[PERSPINTERPPULLMODE_I1]] +// SHADERTEST-NEXT: [[DOTI287:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[REASS_MUL_4_I2]], [[PERSPINTERPPULLMODE_I2]] +// SHADERTEST-NEXT: [[TMP164:%.*]] = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, [[DOTI287]] +// SHADERTEST-NEXT: [[DOTI088:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP164]], [[DOTI085]] +// SHADERTEST-NEXT: [[DOTI189:%.*]] = fmul reassoc nnan nsz arcp contract afn float [[TMP164]], [[DOTI186]] +// SHADERTEST-NEXT: [[TMP165:%.*]] = call float 
@llvm.amdgcn.interp.p1(float [[DOTI088]], i32 0, i32 4, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP166:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP165]], float [[DOTI189]], i32 0, i32 4, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP167:%.*]] = call float @llvm.amdgcn.interp.p1(float [[DOTI088]], i32 1, i32 4, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP168:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP167]], float [[DOTI189]], i32 1, i32 4, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[DOTI090:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP166]], [[DOTI081]] +// SHADERTEST-NEXT: [[DOTI191:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[TMP168]], [[DOTI182]] +// SHADERTEST-NEXT: [[TMP169:%.*]] = call float @llvm.amdgcn.interp.p1(float [[PERSPINTERPCENTER_I0]], i32 0, i32 0, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP170:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP169]], float [[PERSPINTERPCENTER_I1]], i32 0, i32 0, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP171:%.*]] = call float @llvm.amdgcn.interp.p1(float [[PERSPINTERPCENTER_I0]], i32 1, i32 0, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP172:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP171]], float [[PERSPINTERPCENTER_I1]], i32 1, i32 0, i32 [[PRIMMASK]]) +// SHADERTEST-NEXT: [[TMP173:%.*]] = fsub reassoc nnan nsz arcp contract afn float [[DOTI090]], [[TMP170]] +// SHADERTEST-NEXT: [[TMP174:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[DOTI191]], [[TMP172]] +// SHADERTEST-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float [[TMP173]], float [[TMP174]], float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) +// SHADERTEST-NEXT: ret void +// diff --git a/llpc/test/shaderdb/object/ObjStorageBlock_TestRowMajor_lit.frag b/llpc/test/shaderdb/object/ObjStorageBlock_TestRowMajor_lit.frag index 0658475e2d..c94006f2ca 100644 --- a/llpc/test/shaderdb/object/ObjStorageBlock_TestRowMajor_lit.frag +++ 
b/llpc/test/shaderdb/object/ObjStorageBlock_TestRowMajor_lit.frag @@ -25,11 +25,11 @@ void main() // SHADERTEST-NEXT: .entry: // SHADERTEST-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) // SHADERTEST-NEXT: store float 1.000000e+00, ptr addrspace(7) [[TMP0]], align 4 -// SHADERTEST-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 16 +// SHADERTEST-NEXT: [[TMP1:%.*]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(7) [[TMP0]], i32 16 // SHADERTEST-NEXT: store float 1.000000e+00, ptr addrspace(7) [[TMP1]], align 4 -// SHADERTEST-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 32 +// SHADERTEST-NEXT: [[TMP2:%.*]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(7) [[TMP0]], i32 32 // SHADERTEST-NEXT: store float 1.000000e+00, ptr addrspace(7) [[TMP2]], align 4 -// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 48 +// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(7) [[TMP0]], i32 48 // SHADERTEST-NEXT: store float 1.000000e+00, ptr addrspace(7) [[TMP3]], align 4 // SHADERTEST-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(7) [[TMP0]], align 16 // SHADERTEST-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP4]], i64 0 diff --git a/llpc/test/shaderdb/object/ObjStorageBlock_TestRuntimeArray_lit.vert b/llpc/test/shaderdb/object/ObjStorageBlock_TestRuntimeArray_lit.vert index 4776e6fad3..94ef5423e1 100644 --- a/llpc/test/shaderdb/object/ObjStorageBlock_TestRuntimeArray_lit.vert +++ b/llpc/test/shaderdb/object/ObjStorageBlock_TestRuntimeArray_lit.vert @@ -1,3 +1,5 @@ +// NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py +// RUN: amdllpc --print-after=lower-translator -filetype=asm -o - 2>&1 %s | FileCheck -check-prefixes=SHADERTEST %s #version 450 core layout(std430, binding = 0) buffer Block @@ -6,21 +8,38 @@ layout(std430, binding = 0) 
buffer Block vec4 o[]; } block; +layout(set = 1, binding = 0, row_major) buffer Block2 +{ + mat4x2 m4x2; + float f[]; +} data[3]; + void main() { block.o[gl_VertexIndex] = block.base + vec4(gl_VertexIndex); - gl_Position = vec4(1.0); + gl_Position = vec4(data[1].m4x2[0][0], 0.0, 0.0, 1.0); } -// BEGIN_SHADERTEST -/* -; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: getelementptr (<{ [4 x float], [4294967295 x [4 x float]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 1) - -; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: store <4 x float> -; SHADERTEST: AMDLLPC SUCCESS -*/ -// END_SHADERTEST +// SHADERTEST-LABEL: @main( +// SHADERTEST-NEXT: .entry: +// SHADERTEST-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(64) @gl_VertexIndex, align 4 +// SHADERTEST-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(7) @block, align 16 +// SHADERTEST-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(64) @gl_VertexIndex, align 4 +// SHADERTEST-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float +// SHADERTEST-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 +// SHADERTEST-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i32 1 +// SHADERTEST-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP3]], i32 2 +// SHADERTEST-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 3 +// SHADERTEST-NEXT: [[TMP8:%.*]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[TMP1]], [[TMP7]] +// SHADERTEST-NEXT: [[TMP9:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) getelementptr (<{ [4 x float], [0 x [4 x float]] }>, ptr addrspace(7) @block, i32 0, i32 1), i32 16, i32 [[TMP0]]) +// SHADERTEST-NEXT: store <4 x float> [[TMP8]], ptr addrspace(7) [[TMP9]], align 16 +// SHADERTEST-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(7) getelementptr ([3 x <{ [2 x %llpc.matrix.row], 
[0 x float] }>], ptr addrspace(7) @data, i32 0, i32 1, i32 0), align 4 +// SHADERTEST-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i32 0 +// SHADERTEST-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float 0.000000e+00, i32 1 +// SHADERTEST-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float 0.000000e+00, i32 2 +// SHADERTEST-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float 1.000000e+00, i32 3 +// SHADERTEST-NEXT: [[TMP15:%.*]] = call ptr addrspace(65) (...) @llpc.structural.gep__p65(ptr addrspace(65) @[[GLOB0:[0-9]+]], { <4 x float>, float, [1 x float], [1 x float] } poison, i1 false, i32 0, i32 0) +// SHADERTEST-NEXT: store <4 x float> [[TMP14]], ptr addrspace(65) [[TMP15]], align 16 +// SHADERTEST-NEXT: ret void +// diff --git a/llpc/test/shaderdb/object/ObjXfb_TestBasic_lit.vert b/llpc/test/shaderdb/object/ObjXfb_TestBasic_lit.vert index a6582f7697..cd73292ff0 100644 --- a/llpc/test/shaderdb/object/ObjXfb_TestBasic_lit.vert +++ b/llpc/test/shaderdb/object/ObjXfb_TestBasic_lit.vert @@ -14,7 +14,7 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call void @lgc.output.export.xfb.i32.i32.i32.v4f32(i32 0, i32 16, i32 0, <4 x float> {{(splat \(float 2\.000000e\+00\))|()}} +; SHADERTEST: call void (...) 
@lgc.write.xfb.output(i32 0, i32 16, i32 0, <4 x float> {{(splat \(float 2\.000000e\+00\))|()}} ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRayquery.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRayquery.pipe index fd929834c5..aa60416fc8 100644 --- a/llpc/test/shaderdb/ray_tracing/PipelineRayquery.pipe +++ b/llpc/test/shaderdb/ray_tracing/PipelineRayquery.pipe @@ -260,11 +260,11 @@ rtState.rtIpOverride = 0 ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT]] ; CHECK-NEXT: [[TMP10:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP3]], i32 32, i32 [[TMP9]]) ; CHECK-NEXT: [[TMP11:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP10]], i32 12 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(7) [[TMP10]], i32 12 ; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(7) [[TMP12]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP10]], i32 16 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(7) [[TMP10]], i32 16 ; CHECK-NEXT: [[TMP15:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP14]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP10]], i32 28 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(7) [[TMP10]], i32 28 ; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(7) [[TMP16]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = call ptr addrspace(5) (...) 
@lgc.rtq.gep.opaque([3 x i127] poison, i1 false, ptr addrspace(5) [[__LLPC_GLOBAL_PROXY_Q]], i32 0, i32 2) ; CHECK-NEXT: [[TMP19:%.*]] = load <2 x i32>, ptr addrspace(4) [[TMP1]], align 8 diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations.pipe index 09db5cfe81..146adb0a2c 100644 --- a/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations.pipe +++ b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations.pipe @@ -4,27 +4,39 @@ ; TODO: Change this to ISA / assembly output checks once the LLVM backend has settled ; RUN: amdllpc -gfxip 11.0 -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK %s +; RUN: amdllpc -gfxip 11.0 -filetype=asm -add-rt-helpers 1 -o - %s | FileCheck -check-prefixes=ASM %s ; CHECK-LABEL: @_amdgpu_cs_main( ; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain. +; ASM: {{^}}_amdgpu_cs_main: + ; CHECK-LABEL: @_rgen_1( ; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain. +; ASM: {{^}}_rgen_1: + ; CHECK-LABEL: @_rgen_1.resume.0( ; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain. ; CHECK: unreachable ; CHECK: ret void +; ASM: {{^}}_rgen_1.resume.0: + ; CHECK-LABEL: @_chit_2( ; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain. +; ASM: {{^}}_chit_2: + ; CHECK-LABEL: @_cs_( ; CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray. ; CHECK-NOT: ret void ; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain. 
; CHECK-NOT: ret void +; ASM: {{^}}_cs_: +; ASM: image_bvh64_intersect_ray + [Version] version = 69 diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_IntersectionShaderVgprCount.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_IntersectionShaderVgprCount.pipe new file mode 100644 index 0000000000..dc0b9e2af0 --- /dev/null +++ b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_IntersectionShaderVgprCount.pipe @@ -0,0 +1,336 @@ +; Check that the payload register count for intersection shader is reasonable + +; RUN: amdllpc -gfxip=11.0 --report-payload-register-sizes=max %s 2>&1 | FileCheck -check-prefix=CHECK-FROM-LIB %s +; CHECK-FROM-LIB: Incoming and max outgoing payload VGPR size of "_sect_4" (intersection): 4 and 4 dwords + +; RUN: grep -v "payloadSizeMaxInLib" %s > %t.pipe +; RUN: amdllpc -gfxip=11.0 --report-payload-register-sizes=max %t.pipe 2>&1 | FileCheck -check-prefix=CHECK-FROM-PIPE %s +; CHECK-FROM-PIPE: Incoming and max outgoing payload VGPR size of "_sect_4" (intersection): 3 and 3 dwords + +[Version] +version = 53 + +[rgenGlsl] +#version 460 +#extension GL_EXT_ray_tracing : enable + +layout(binding = 0, set = 0) uniform accelerationStructureEXT g_bvh; +layout(binding = 1, set = 0, rgba32f) uniform image2D g_dst; + +layout(location = 0) rayPayloadEXT vec3 g_ray; + +void main() { + vec3 origin; + origin.x = gl_LaunchIDEXT.x; + origin.y = gl_LaunchIDEXT.y; + origin.z = 0; + + traceRayEXT(g_bvh, /* ray flags */ 0, /* cull mask */ 0xff, + /* sbt offset */ 0, /* sbt stride */ 1, /* miss index */ 0, + origin.xyz, /* tmin */ 0.0, /* direction */ vec3(1, 0, 0), + /* tmax */ 48.0, /* payload location */ 0); + + imageStore(g_dst, ivec2(gl_LaunchIDEXT.xy), vec4(g_ray, 0)); +} + +[rgenInfo] +entryPoint = main +options.trapPresent = 0 +options.debugMode = 0 +options.enablePerformanceData = 0 +options.allowReZ = 0 +options.vgprLimit = 0 +options.sgprLimit = 0 +options.maxThreadGroupsPerComputeUnit = 0 +options.waveSize = 
32 +options.wgpMode = 0 +options.waveBreakSize = None +options.forceLoopUnrollCount = 0 +options.useSiScheduler = 0 +options.updateDescInElf = 0 +options.allowVaryWaveSize = 0 +options.enableLoadScalarizer = 0 +options.disableLicm = 0 +options.unrollThreshold = 0 +options.scalarThreshold = 0 +options.disableLoopUnroll = 0 +options.fp32DenormalMode = Auto +options.adjustDepthImportVrs = 0 +options.disableLicmThreshold = 0 +options.unrollHintThreshold = 0 +options.dontUnrollHintThreshold = 0 + +[missGlsl] +#version 460 +#extension GL_EXT_ray_tracing : enable + +void main() +{ +} + + +[missInfo] +entryPoint = main +options.trapPresent = 0 +options.debugMode = 0 +options.enablePerformanceData = 0 +options.allowReZ = 0 +options.vgprLimit = 0 +options.sgprLimit = 0 +options.maxThreadGroupsPerComputeUnit = 0 +options.waveSize = 0 +options.wgpMode = 0 +options.waveBreakSize = None +options.forceLoopUnrollCount = 0 +options.useSiScheduler = 0 +options.updateDescInElf = 0 +options.allowVaryWaveSize = 0 +options.enableLoadScalarizer = 0 +options.disableLicm = 0 +options.unrollThreshold = 0 +options.scalarThreshold = 0 +options.disableLoopUnroll = 0 +options.fp32DenormalMode = Auto +options.adjustDepthImportVrs = 0 +options.disableLicmThreshold = 0 +options.unrollHintThreshold = 0 +options.dontUnrollHintThreshold = 0 + +[chitGlsl] +#version 460 +#extension GL_EXT_ray_tracing : enable + +layout(location = 0) rayPayloadInEXT vec3 hitValue; + +void main() +{ + hitValue = vec3(3, 4, 5); +} + +[chitInfo] +entryPoint = main +options.trapPresent = 0 +options.debugMode = 0 +options.enablePerformanceData = 0 +options.allowReZ = 0 +options.vgprLimit = 0 +options.sgprLimit = 0 +options.maxThreadGroupsPerComputeUnit = 0 +options.waveSize = 0 +options.wgpMode = 0 +options.waveBreakSize = None +options.forceLoopUnrollCount = 0 +options.useSiScheduler = 0 +options.updateDescInElf = 0 +options.allowVaryWaveSize = 0 +options.enableLoadScalarizer = 0 +options.disableLicm = 0 
+options.unrollThreshold = 0 +options.scalarThreshold = 0 +options.disableLoopUnroll = 0 +options.fp32DenormalMode = Auto +options.adjustDepthImportVrs = 0 +options.disableLicmThreshold = 0 +options.unrollHintThreshold = 0 +options.dontUnrollHintThreshold = 0 + +[sectGlsl] +#version 460 +#extension GL_EXT_ray_tracing : enable + +void main() +{ + reportIntersectionEXT(0.5, 0u); +} + +[sectInfo] +entryPoint = main +options.trapPresent = 0 +options.debugMode = 0 +options.enablePerformanceData = 0 +options.allowReZ = 0 +options.vgprLimit = 0 +options.sgprLimit = 0 +options.maxThreadGroupsPerComputeUnit = 0 +options.waveSize = 0 +options.wgpMode = 0 +options.waveBreakSize = None +options.forceLoopUnrollCount = 0 +options.useSiScheduler = 0 +options.updateDescInElf = 0 +options.allowVaryWaveSize = 0 +options.enableLoadScalarizer = 0 +options.disableLicm = 0 +options.unrollThreshold = 0 +options.scalarThreshold = 0 +options.disableLoopUnroll = 0 +options.fp32DenormalMode = Auto +options.adjustDepthImportVrs = 0 +options.disableLicmThreshold = 0 +options.unrollHintThreshold = 0 +options.dontUnrollHintThreshold = 0 + +[ResourceMapping] +userDataNode[0].visibility = 1728 +userDataNode[0].type = DescriptorTableVaPtr +userDataNode[0].offsetInDwords = 0 +userDataNode[0].sizeInDwords = 1 +userDataNode[0].next[0].type = DescriptorConstBuffer +userDataNode[0].next[0].offsetInDwords = 0 +userDataNode[0].next[0].sizeInDwords = 4 +userDataNode[0].next[0].set = 0x00000000 +userDataNode[0].next[0].binding = 0 +userDataNode[0].next[1].type = DescriptorImage +userDataNode[0].next[1].offsetInDwords = 4 +userDataNode[0].next[1].sizeInDwords = 8 +userDataNode[0].next[1].set = 0x00000000 +userDataNode[0].next[1].binding = 1 +userDataNode[0].next[2].type = DescriptorImage +userDataNode[0].next[2].offsetInDwords = 12 +userDataNode[0].next[2].sizeInDwords = 8 +userDataNode[0].next[2].set = 0x00000000 +userDataNode[0].next[2].binding = 2 +userDataNode[0].next[3].type = DescriptorConstBuffer 
+userDataNode[0].next[3].offsetInDwords = 20 +userDataNode[0].next[3].sizeInDwords = 4 +userDataNode[0].next[3].set = 0x00000000 +userDataNode[0].next[3].binding = 3 +userDataNode[0].next[4].type = DescriptorBuffer +userDataNode[0].next[4].offsetInDwords = 24 +userDataNode[0].next[4].sizeInDwords = 4 +userDataNode[0].next[4].set = 0x00000000 +userDataNode[0].next[4].binding = 4 +userDataNode[0].next[5].type = DescriptorBuffer +userDataNode[0].next[5].offsetInDwords = 28 +userDataNode[0].next[5].sizeInDwords = 4 +userDataNode[0].next[5].set = 0x00000000 +userDataNode[0].next[5].binding = 5 +userDataNode[0].next[6].type = DescriptorBuffer +userDataNode[0].next[6].offsetInDwords = 32 +userDataNode[0].next[6].sizeInDwords = 4 +userDataNode[0].next[6].set = 0x00000000 +userDataNode[0].next[6].binding = 6 +userDataNode[0].next[7].type = DescriptorBuffer +userDataNode[0].next[7].offsetInDwords = 36 +userDataNode[0].next[7].sizeInDwords = 4 +userDataNode[0].next[7].set = 0x00000000 +userDataNode[0].next[7].binding = 7 +userDataNode[0].next[8].type = DescriptorCombinedTexture +userDataNode[0].next[8].offsetInDwords = 40 +userDataNode[0].next[8].sizeInDwords = 12 +userDataNode[0].next[8].set = 0x00000000 +userDataNode[0].next[8].binding = 8 +userDataNode[0].next[9].type = DescriptorBuffer +userDataNode[0].next[9].offsetInDwords = 52 +userDataNode[0].next[9].sizeInDwords = 4 +userDataNode[0].next[9].set = 0x00000000 +userDataNode[0].next[9].binding = 9 +userDataNode[1].visibility = 4032 +userDataNode[1].type = DescriptorTableVaPtr +userDataNode[1].offsetInDwords = 2 +userDataNode[1].sizeInDwords = 1 +userDataNode[1].next[0].type = DescriptorConstBufferCompact +userDataNode[1].next[0].offsetInDwords = 0 +userDataNode[1].next[0].sizeInDwords = 2 +userDataNode[1].next[0].set = 0x0000005D +userDataNode[1].next[0].binding = 17 +userDataNode[1].next[1].type = DescriptorConstBuffer +userDataNode[1].next[1].offsetInDwords = 2 +userDataNode[1].next[1].sizeInDwords = 4 
+userDataNode[1].next[1].set = 0x0000005D +userDataNode[1].next[1].binding = 0 +userDataNode[1].next[2].type = DescriptorBuffer +userDataNode[1].next[2].offsetInDwords = 6 +userDataNode[1].next[2].sizeInDwords = 4 +userDataNode[1].next[2].set = 0x0000005D +userDataNode[1].next[2].binding = 1 + +[RayTracingPipelineState] +deviceIndex = 0 +options.includeDisassembly = 0 +options.scalarBlockLayout = 0 +options.includeIr = 0 +options.robustBufferAccess = 0 +options.reconfigWorkgroupLayout = 0 +options.shadowDescriptorTableUsage = Auto +options.shadowDescriptorTablePtrHigh = 0 +options.extendedRobustness.robustBufferAccess = 0 +options.extendedRobustness.robustImageAccess = 0 +options.extendedRobustness.nullDescriptor = 0 +groups[0].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR +groups[0].generalShader = 0 +groups[0].closestHitShader = -1 +groups[0].anyHitShader = -1 +groups[0].intersectionShader = -1 +groups[1].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR +groups[1].generalShader = 1 +groups[1].closestHitShader = -1 +groups[1].anyHitShader = -1 +groups[1].intersectionShader = -1 +groups[2].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR +groups[2].generalShader = -1 +groups[2].closestHitShader = 2 +groups[2].anyHitShader = -1 +groups[2].intersectionShader = -1 +groups[3].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR +groups[3].generalShader = -1 +groups[3].closestHitShader = 3 +groups[3].anyHitShader = -1 +groups[3].intersectionShader = 4 +maxRecursionDepth = 1 +indirectStageMask = 4294967295 +mode = 3 +rtState.bvhResDescSize = 4 +rtState.bvhResDesc[0] = 0 +rtState.bvhResDesc[1] = 2197815296 +rtState.bvhResDesc[2] = 4294967295 +rtState.bvhResDesc[3] = 2164261887 +rtState.nodeStrideShift = 7 +rtState.staticPipelineFlags = 0 +rtState.triCompressMode = 0 +rtState.pipelineFlags = 0 +rtState.threadGroupSizeX = 8 +rtState.threadGroupSizeY = 4 +rtState.threadGroupSizeZ = 1 +rtState.boxSortHeuristicMode = 0 +rtState.counterMode = 
0 +rtState.counterMask = 0 +rtState.rayQueryCsSwizzle = 1 +rtState.ldsStackSize = 16 +rtState.dispatchRaysThreadGroupSize = 32 +rtState.ldsSizePerThreadGroup = 65536 +rtState.outerTileSize = 4 +rtState.dispatchDimSwizzleMode = 0 +rtState.exportConfig.indirectCallingConvention = 1 +rtState.exportConfig.indirectCalleeSavedRegs.raygen = 2 +rtState.exportConfig.indirectCalleeSavedRegs.miss = 40 +rtState.exportConfig.indirectCalleeSavedRegs.closestHit = 50 +rtState.exportConfig.indirectCalleeSavedRegs.anyHit = 75 +rtState.exportConfig.indirectCalleeSavedRegs.intersection = 75 +rtState.exportConfig.indirectCalleeSavedRegs.callable = 28 +rtState.exportConfig.indirectCalleeSavedRegs.traceRays = 28 +rtState.exportConfig.enableUniformNoReturn = 1 +rtState.exportConfig.enableTraceRayArgsInLds = 0 +rtState.exportConfig.readsDispatchRaysIndex = 0 +rtState.exportConfig.enableDynamicLaunch = 0 +rtState.exportConfig.emitRaytracingShaderDataToken = 0 +rtState.enableRayQueryCsSwizzle = 0 +rtState.enableDispatchRaysInnerSwizzle = 1 +rtState.enableDispatchRaysOuterSwizzle = 1 +rtState.forceInvalidAccelStruct = 0 +rtState.enableRayTracingCounters = 0 +rtState.enableOptimalLdsStackSizeForIndirect = 1 +rtState.enableOptimalLdsStackSizeForUnified = 1 +rtState.gpurtFuncTable.pFunc[0] = TraceRay1_1 +rtState.gpurtFuncTable.pFunc[1] = TraceRayInline1_1 +rtState.gpurtFuncTable.pFunc[2] = TraceRayUsingHitToken1_1 +rtState.gpurtFuncTable.pFunc[3] = RayQueryProceed1_1 +rtState.gpurtFuncTable.pFunc[4] = GetInstanceIndex +rtState.gpurtFuncTable.pFunc[5] = GetInstanceID +rtState.gpurtFuncTable.pFunc[6] = GetObjectToWorldTransform +rtState.gpurtFuncTable.pFunc[7] = GetWorldToObjectTransform +rtState.gpurtFuncTable.pFunc[8] = TraceLongRayAMD1_1 +rtState.gpurtFuncTable.pFunc[9] = LongRayQueryProceed1_1 +payloadSizeMaxInLib = 16 +attributeSizeMaxInLib = 0 +hasPipelineLibrary = 0 diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders.pipe 
b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders.pipe index cc50265c6d..a4ada2ac99 100644 --- a/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders.pipe +++ b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders.pipe @@ -192,8 +192,8 @@ entryPoint = main // For intersection, do not force the line to end with ARG_SLOTS, as intersection pessimistically preserves // payload VGPRs, and thus may see a larger number of args: // CHECK-NEXT: [SDS] [[ARG_SLOTS]] -// Check that intersection pessimistically preserves 32 payload VGPRs: -// CHECK-NEXT: [SDS] {{[CUDP]+}}PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} +// Check that at least 7 dwords at the end are preserved for the payload: +// CHECK-NEXT: [SDS] {{[CUDP]+}}PPPPPPP{{$}} void main() { reportIntersectionEXT(0.5, 0u); diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRays_Irreducible.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRays_Irreducible.pipe index 4005996e4a..6d51383992 100644 --- a/llpc/test/shaderdb/ray_tracing/PipelineRays_Irreducible.pipe +++ b/llpc/test/shaderdb/ray_tracing/PipelineRays_Irreducible.pipe @@ -1,7 +1,7 @@ ; RUN: amdllpc -gfxip 11.0 -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK %s ; This case will have irreducible CFG after continuation transform. -; Without fixing irreducible, PatchBufferOp would generate several global load. +; Without fixing irreducible, LowerBufferOperations would generate several global load. 
; CHECK-LABEL: @_rgen_1.resume.0( ; CHECK-NOT: load {{.*}}, ptr addrspace(1) diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRays_TestRtIgnoreDeclaredPayloadSize.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRays_TestRtIgnoreDeclaredPayloadSize.pipe new file mode 100644 index 0000000000..240415501d --- /dev/null +++ b/llpc/test/shaderdb/ray_tracing/PipelineRays_TestRtIgnoreDeclaredPayloadSize.pipe @@ -0,0 +1,280 @@ +; Test that only payloadSizeMaxInLib would be honored when rtIgnoreDeclaredPayloadSize is enable. +; BEGIN_SHADERTEST +; RUN: amdllpc %gfxip -emit-lgc %s -o - | FileCheck -check-prefix=SHADERTEST %s +; SHADERTEST-LABEL: @_rgen_1( +; SHADERTEST: %{{.*}} = call spir_func [{{[0-9]}} x i32] %{{.*}}([3 x i32] [i32 0, i32 undef, i32 undef], {{.*}}) #[[ATTR0:[0-9]+]], !continufy.stage !18 +; END_SHADERTEST + +[Version] +version = 75 + +[rgenGlsl] +#version 460 +#extension GL_EXT_ray_tracing : require +#extension GL_EXT_nonuniform_qualifier : require + +struct _43 +{ + uint _m0[17]; +}; + +layout(set = 1, binding = 0, std430) restrict readonly buffer RTASHeap +{ + uvec2 _m0[]; +} _29; + +layout(push_constant, std430) uniform RootConstants +{ + uint _m0; + uint _m1; +} registers; + +layout(set = 1, binding = 1) uniform texture2D _15[]; +layout(set = 1, binding = 1) uniform writeonly image2D _33[]; +layout(location = 0) rayPayloadEXT _43 _45; + +void main() +{ + _45._m0[0] = 0u; + _45._m0[4] = 4u; + + traceRayEXT(accelerationStructureEXT(_29._m0[registers._m0 + 9u]), 28u, 1u, 1u, 2u, 0u, vec3(0.0), 0.0, vec3(1.0), 1000.0, 0); + imageStore(_33[registers._m1 + 1u], ivec2(gl_LaunchIDEXT.xy), vec4(_45._m0[0])); +} + +[rgenInfo] +entryPoint = main +options.clientHash = 0x0, 0x0 +options.trapPresent = 0 +options.debugMode = 0 +options.enablePerformanceData = 0 +options.allowReZ = 0 +options.vgprLimit = 120 +options.sgprLimit = 0 +options.maxThreadGroupsPerComputeUnit = 0 +options.subgroupSize = 0 +options.waveSize = 32 +options.wgpMode = 0 +options.waveBreakSize = 
None +options.forceLoopUnrollCount = 0 +options.enableLoadScalarizer = 0 +options.allowVaryWaveSize = 1 +options.useSiScheduler = 0 +options.disableCodeSinking = 0 +options.favorLatencyHiding = 0 +options.disableLicm = 0 +options.unrollThreshold = 0 +options.scalarThreshold = 0 +options.disableLoopUnroll = 0 +options.adjustDepthImportVrs = 0 +options.fp32DenormalMode = Auto +options.disableLicmThreshold = 0 +options.unrollHintThreshold = 0 +options.dontUnrollHintThreshold = 0 +options.noContractOpDot = 0 +options.fastMathFlags = 0 +options.disableFastMathFlags = 0 +options.ldsSpillLimitDwords = 0 +options.overrideForceThreadIdSwizzling = 0 +options.overrideShaderThreadGroupSizeX = 0 +options.overrideShaderThreadGroupSizeY = 0 +options.overrideShaderThreadGroupSizeZ = 0 +options.forceLateZ = 0 +options.nsaThreshold = 0 +options.aggressiveInvariantLoads = Auto +options.workaroundStorageImageFormats = 0 +options.workaroundInitializeOutputsToZero = 0 +options.disableFMA = 0 +options.disableReadFirstLaneWorkaround = 0 +options.backwardPropagateNoContract = 0 +options.forwardPropagateNoContract = 1 +options.workgroupRoundRobin = 0 +options.constantBufferBindingOffset = 0 +options.imageSampleDrefReturnsRgba = 0 +options.disableGlPositionOpt = 0 +options.viewIndexFromDeviceIndex = 0 +options.resourceCount = 0 +options.forceUnderflowPrevention = 0 +options.forceMemoryBarrierScope = 0 + +[ResourceMapping] +userDataNode[0].visibility = 16128 +userDataNode[0].type = DescriptorTableVaPtr +userDataNode[0].offsetInDwords = 43 +userDataNode[0].sizeInDwords = 1 +userDataNode[0].next[0].type = DescriptorConstBufferCompact +userDataNode[0].next[0].offsetInDwords = 0 +userDataNode[0].next[0].sizeInDwords = 2 +userDataNode[0].next[0].set = 0x0000005D +userDataNode[0].next[0].binding = 17 +userDataNode[0].next[0].strideInDwords = 0 +userDataNode[0].next[1].type = DescriptorConstBuffer +userDataNode[0].next[1].offsetInDwords = 2 +userDataNode[0].next[1].sizeInDwords = 8 
+userDataNode[0].next[1].set = 0x0000005D +userDataNode[0].next[1].binding = 0 +userDataNode[0].next[1].strideInDwords = 0 +userDataNode[0].next[2].type = DescriptorBuffer +userDataNode[0].next[2].offsetInDwords = 10 +userDataNode[0].next[2].sizeInDwords = 8 +userDataNode[0].next[2].set = 0x0000005D +userDataNode[0].next[2].binding = 1 +userDataNode[0].next[2].strideInDwords = 0 +userDataNode[1].visibility = 2 +userDataNode[1].type = StreamOutTableVaPtr +userDataNode[1].offsetInDwords = 2 +userDataNode[1].sizeInDwords = 1 +userDataNode[2].visibility = 16128 +userDataNode[2].type = PushConst +userDataNode[2].offsetInDwords = 3 +userDataNode[2].sizeInDwords = 38 +userDataNode[2].set = 0xFFFFFFFF +userDataNode[2].binding = 0 +userDataNode[2].strideInDwords = 0 +userDataNode[3].visibility = 16128 +userDataNode[3].type = DescriptorTableVaPtr +userDataNode[3].offsetInDwords = 45 +userDataNode[3].sizeInDwords = 1 +userDataNode[3].next[0].type = DescriptorBuffer +userDataNode[3].next[0].offsetInDwords = 0 +userDataNode[3].next[0].sizeInDwords = 16 +userDataNode[3].next[0].set = 0x00000001 +userDataNode[3].next[0].binding = 0 +userDataNode[3].next[0].strideInDwords = 0 +userDataNode[3].next[1].type = DescriptorMutable +userDataNode[3].next[1].offsetInDwords = 16 +userDataNode[3].next[1].sizeInDwords = 8000000 +userDataNode[3].next[1].set = 0x00000001 +userDataNode[3].next[1].binding = 1 +userDataNode[3].next[1].strideInDwords = 8 + +[RayTracingPipelineState] +deviceIndex = 0 +options.includeDisassembly = 0 +options.scalarBlockLayout = 1 +options.reconfigWorkgroupLayout = 0 +options.forceCsThreadIdSwizzling = 0 +options.includeIr = 0 +options.robustBufferAccess = 1 +options.enableRelocatableShaderElf = 0 +options.disableImageResourceCheck = 0 +options.enableScratchAccessBoundsChecks = 0 +options.enableImplicitInvariantExports = 1 +options.shadowDescriptorTableUsage = Disable +options.shadowDescriptorTablePtrHigh = 0 +options.extendedRobustness.robustBufferAccess = 1 
+options.extendedRobustness.robustImageAccess = 1 +options.extendedRobustness.nullDescriptor = 1 +options.enableRayQuery = 0 +options.optimizeTessFactor = 1 +options.enableInterpModePatch = 0 +options.pageMigrationEnabled = 0 +options.optimizationLevel = 2 +options.overrideThreadGroupSizeX = 0 +options.overrideThreadGroupSizeY = 0 +options.overrideThreadGroupSizeZ = 0 +options.resourceLayoutScheme = Compact +options.threadGroupSwizzleMode = Default +options.reverseThreadGroup = 0 +options.internalRtShaders = 0 +options.forceNonUniformResourceIndexStageMask = 0 +options.expertSchedulingMode = 0 +options.glState.replaceSetWithResourceType = 0 +options.glState.disableSampleMask = 0 +options.glState.buildResourcesDataForShaderModule = 0 +options.glState.disableTruncCoordForGather = 1 +options.glState.enableCombinedTexture = 0 +options.glState.vertex64BitsAttribSingleLoc = 0 +options.glState.enableFragColor = 0 +options.glState.disableBaseVertex = 0 +options.glState.enablePolygonStipple = 0 +options.glState.enableLineSmooth = 0 +options.glState.emulateWideLineStipple = 0 +options.glState.enablePointSmooth = 0 +options.glState.enableRemapLocation = 0 +options.cacheScopePolicyControl = 0 +options.temporalHintControl = 0x777777 +options.enablePrimGeneratedQuery = 0 +options.disablePerCompFetch = 0 +options.optimizePointSizeWrite = 1 +groups[0].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR +groups[0].generalShader = 0 +groups[0].closestHitShader = -1 +groups[0].anyHitShader = -1 +groups[0].intersectionShader = -1 +maxRecursionDepth = 2 +indirectStageMask = 4294967295 +libraryMode = 2 +mode = 1 +cpsFlags = 0 +disableDynamicVgpr = 0 +dynamicVgprBlockSize =0 +rtState.nodeStrideShift = 7 +rtState.bvhResDescSize = 4 +rtState.bvhResDesc[0] = 0 +rtState.bvhResDesc[1] = 2197815296 +rtState.bvhResDesc[2] = 4294967295 +rtState.bvhResDesc[3] = 2172650495 +rtState.staticPipelineFlags = 0 +rtState.triCompressMode = 3 +rtState.boxSortHeuristicMode = 5 +rtState.pipelineFlags = 
536872960 +rtState.counterMode = 0 +rtState.counterMask = 0 +rtState.threadGroupSizeX = 8 +rtState.threadGroupSizeY = 4 +rtState.threadGroupSizeZ = 1 +rtState.rayQueryCsSwizzle = 1 +rtState.ldsStackSize = 16 +rtState.dispatchRaysThreadGroupSize = 32 +rtState.ldsSizePerThreadGroup = 65536 +rtState.outerTileSize = 4 +rtState.dispatchDimSwizzleMode = 0 +rtState.exportConfig.indirectCallingConvention = 1 +rtState.exportConfig.indirectCalleeSavedRegs.raygen = 2 +rtState.exportConfig.indirectCalleeSavedRegs.miss = 40 +rtState.exportConfig.indirectCalleeSavedRegs.closestHit = 50 +rtState.exportConfig.indirectCalleeSavedRegs.anyHit = 75 +rtState.exportConfig.indirectCalleeSavedRegs.intersection = 75 +rtState.exportConfig.indirectCalleeSavedRegs.callable = 28 +rtState.exportConfig.indirectCalleeSavedRegs.traceRays = 28 +rtState.exportConfig.enableUniformNoReturn = 1 +rtState.exportConfig.enableTraceRayArgsInLds = 0 +rtState.exportConfig.enableReducedLinkageOpt = 0 +rtState.exportConfig.readsDispatchRaysIndex = 0 +rtState.exportConfig.enableDynamicLaunch = 0 +rtState.exportConfig.emitRaytracingShaderDataToken = 0 +rtState.enableRayQueryCsSwizzle = 0 +rtState.enableDispatchRaysInnerSwizzle = 1 +rtState.enableDispatchRaysOuterSwizzle = 1 +rtState.forceInvalidAccelStruct = 0 +rtState.enableRayTracingCounters = 0 +rtState.enableRayTracingHwTraversalStack = 1 +rtState.enableOptimalLdsStackSizeForIndirect = 1 +rtState.enableOptimalLdsStackSizeForUnified = 1 +rtState.maxRayLength = 0 +rtState.enablePickClosestLaneResultForAbortRays = 0 +rtState.traceRayWaveDensityThreshold[8] = 1 +rtState.gpurtFeatureFlags = 0 +rtState.gpurtFuncTable.pFunc[0] = TraceRay2_0 +rtState.gpurtFuncTable.pFunc[1] = TraceRayInline2_0 +rtState.gpurtFuncTable.pFunc[2] = TraceRayUsingHitToken2_0 +rtState.gpurtFuncTable.pFunc[3] = RayQueryProceed2_0 +rtState.gpurtFuncTable.pFunc[4] = GetInstanceIndex +rtState.gpurtFuncTable.pFunc[5] = GetInstanceID +rtState.gpurtFuncTable.pFunc[6] = GetObjectToWorldTransform 
+rtState.gpurtFuncTable.pFunc[7] = GetWorldToObjectTransform +rtState.gpurtFuncTable.pFunc[8] = GetRayQuery64BitInstanceNodePtr +rtState.gpurtFuncTable.pFunc[9] = TraceLongRayAMD2_0 +rtState.gpurtFuncTable.pFunc[10] = LongRayQueryProceedAMD2_0 +rtState.gpurtFuncTable.pFunc[11] = FetchTrianglePositionFromNodePointer +rtState.gpurtFuncTable.pFunc[12] = FetchTrianglePositionFromRayQuery +rtState.rtIpVersion = 2.0 +rtState.gpurtOverride = 0 +rtState.rtIpOverride = 0 +payloadSizeMaxInLib = 12 +attributeSizeMaxInLib = 8 +hasPipelineLibrary = 0 +pipelineLibStageMask = 0 +rtIgnoreDeclaredPayloadSize = 1 + diff --git a/llpc/test/shaderdb/ray_tracing/TestContState.rgen b/llpc/test/shaderdb/ray_tracing/TestContState.rgen new file mode 100644 index 0000000000..3979e00d25 --- /dev/null +++ b/llpc/test/shaderdb/ray_tracing/TestContState.rgen @@ -0,0 +1,39 @@ + +// Test that payload is not put into continuations state. +// RUN: amdllpc %gfxip --llpc-raytracing-mode=continuations --report-cont-state-sizes %s 2>&1 | FileCheck -check-prefix=CHECK %s +// CHECK: Continuation state size of "_rgen_1" (raygeneration): 0 bytes +#version 460 +#extension GL_EXT_ray_tracing : require + +struct RayPayload { + vec4 a; + vec4 b; + float c; +}; + +layout(binding = 0, set = 0) uniform accelerationStructureEXT g_bvh; +layout(binding = 1, set = 0, rgba32f) uniform image2D g_dst; +layout(binding = 2, set = 0) uniform Buf { + vec4 u_a; + vec4 u_b; + float u_c; +}; + +layout(location = 14) rayPayloadEXT RayPayload g_ray; + +void main() { + g_ray.a = u_a; + g_ray.b = u_b; + g_ray.c = u_c; + vec3 origin; + origin.x = gl_LaunchIDEXT.x; + origin.y = gl_LaunchIDEXT.y; + origin.z = 0; + + traceRayEXT(g_bvh, /* ray flags */ 0, /* cull mask */ 0xff, + /* sbt offset */ 0, /* sbt stride */ 1, /* miss index */ 0, + origin.xyz, /* tmin */ 0.0, /* direction */ vec3(1, 0, 0), + /* tmax */ 48.0, /* payload location */ 14); + + imageStore(g_dst, ivec2(gl_LaunchIDEXT.xy), g_ray.a); +} diff --git 
a/llpc/test/shaderdb/ray_tracing/TestPayloadSizes.rgen b/llpc/test/shaderdb/ray_tracing/TestPayloadSizes.rgen new file mode 100644 index 0000000000..c0d7376e05 --- /dev/null +++ b/llpc/test/shaderdb/ray_tracing/TestPayloadSizes.rgen @@ -0,0 +1,48 @@ +// NOTE : Do not autogenerate + +// Check that we calculate payload size correctly using scalar alignment (requested by spec) + +#version 460 // COMMON +#extension GL_EXT_ray_tracing : enable // COMMON +#extension GL_EXT_shader_explicit_arithmetic_types : require // COMMON + +// Different payload types for testing + +#define PAYLOAD_TYPE vec3 // TEST-1 + +#define PAYLOAD_TYPE struct { float a; double b; } // TEST-2 + +#define PAYLOAD_TYPE vec3[2] // TEST-3 + +#define PAYLOAD_TYPE struct { float a; vec4 b; vec3 c; } // TEST-4 + +#define PAYLOAD_TYPE float16_t[3] // TEST-5 + +// Common testing part +layout(location = 0) rayPayloadEXT PAYLOAD_TYPE g_ray; // COMMON +layout(binding = 0, set = 0) uniform accelerationStructureEXT g_bvh; // COMMON + +void main() { // COMMON + vec3 origin = {gl_LaunchIDEXT.x, gl_LaunchIDEXT.y, 0}; // COMMON + traceRayEXT(g_bvh, 0, 0xff, 0, 1, 0, origin.xyz, 0.0, vec3(1, 0, 0), 48.0, 0); // COMMON +} // COMMON + +// RUN: grep -e COMMON -e TEST-1 %s > %t.rgen +// RUN: amdllpc -gfxip=11.0 -llpc-raytracing-mode=continuations --report-payload-register-sizes=max %t.rgen 2>&1 | FileCheck -check-prefix=CHECK-1 %s +// CHECK-1: Incoming and max outgoing payload VGPR size of "_rgen_1" (raygeneration): 0 and 3 dwords + +// RUN: grep -e COMMON -e TEST-2 %s > %t.rgen +// RUN: amdllpc -gfxip=11.0 -llpc-raytracing-mode=continuations --report-payload-register-sizes=max %t.rgen 2>&1 | FileCheck -check-prefix=CHECK-2 %s +// CHECK-2: Incoming and max outgoing payload VGPR size of "_rgen_1" (raygeneration): 0 and 4 dwords + +// RUN: grep -e COMMON -e TEST-3 %s > %t.rgen +// RUN: amdllpc -gfxip=11.0 -llpc-raytracing-mode=continuations --report-payload-register-sizes=max %t.rgen 2>&1 | FileCheck -check-prefix=CHECK-3 
%s +// CHECK-3: Incoming and max outgoing payload VGPR size of "_rgen_1" (raygeneration): 0 and 6 dwords + +// RUN: grep -e COMMON -e TEST-4 %s > %t.rgen +// RUN: amdllpc -gfxip=11.0 -llpc-raytracing-mode=continuations --report-payload-register-sizes=max %t.rgen 2>&1 | FileCheck -check-prefix=CHECK-4 %s +// CHECK-4: Incoming and max outgoing payload VGPR size of "_rgen_1" (raygeneration): 0 and 8 dwords + +// RUN: grep -e COMMON -e TEST-5 %s > %t.rgen +// RUN: amdllpc -gfxip=11.0 -llpc-raytracing-mode=continuations --report-payload-register-sizes=max %t.rgen 2>&1 | FileCheck -check-prefix=CHECK-5 %s +// CHECK-5: Incoming and max outgoing payload VGPR size of "_rgen_1" (raygeneration): 0 and 2 dwords diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineGs_BasicRelocGsTest.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineGs_BasicRelocGsTest.pipe index 5b15dc4f89..89316ac4ef 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineGs_BasicRelocGsTest.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineGs_BasicRelocGsTest.pipe @@ -10,11 +10,11 @@ ; SHADERTEST-LABEL: PalMetadata ; SHADERTEST-LABEL: .hardware_stages ; SHADERTEST-LABEL: .gs: -; SHADERTEST-LABEL: .entry_point: _amdgpu_gs_main +; SHADERTEST-LABEL: .entry_point{{(_symbol)?}}: _amdgpu_gs_main ; SHADERTEST-LABEL: .ps: -; SHADERTEST-LABEL: .entry_point: _amdgpu_ps_main +; SHADERTEST-LABEL: .entry_point{{(_symbol)?}}: _amdgpu_ps_main ; SHADERTEST-LABEL: .vs: -; SHADERTEST-LABEL: .entry_point: _amdgpu_vs_main +; SHADERTEST-LABEL: .entry_point{{(_symbol)?}}: _amdgpu_vs_main ; SHADERTEST-LABEL: .type: Gs ; SHADERTEST: AMDLLPC SUCCESS ; END_SHADERTEST diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe index 6471ed5c79..f246f2613f 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe +++ 
b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe @@ -85,7 +85,7 @@ attribute[0].offset = 0 ; SHADERTEST-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; SHADERTEST-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; SHADERTEST-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> -; SHADERTEST-NEXT: [[INTERPPERSPCENTER:%.*]] = call <2 x float> @lgc.input.import.builtin.InterpPerspCenter.v2f32.i32(i32 268435457) #[[ATTR4:[0-9]+]] +; SHADERTEST-NEXT: [[INTERPPERSPCENTER:%.*]] = call <2 x float> @lgc.input.import.builtin.InterpPerspCenter.v2f32.i32(i32 268435457) #[[ATTR5:[0-9]+]] ; SHADERTEST-NEXT: [[TMP4:%.*]] = call <2 x float> (...) @lgc.input.import.interpolated__v2f32(i1 false, i32 0, i32 0, i32 0, i32 poison, i32 0, <2 x float> [[INTERPPERSPCENTER]]) ; SHADERTEST-NEXT: [[TMP5:%.*]] = call i32 @lgc.load.user.data__i32(i32 44) ; SHADERTEST-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 0 @@ -101,12 +101,12 @@ attribute[0].offset = 0 ; SHADERTEST-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP13]], i32 0 ; SHADERTEST-NEXT: [[TMP15:%.*]] = fptosi <2 x float> [[TMP4]] to <2 x i32> ; SHADERTEST-NEXT: [[TMP16:%.*]] = sitofp <2 x i32> [[TMP15]] to <2 x float> -; SHADERTEST-NEXT: [[TMP17:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP14]], align 4, !invariant.load !11 -; SHADERTEST-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP9]], align 4, !invariant.load !11 +; SHADERTEST-NEXT: [[TMP17:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP14]], align 4, !invariant.load !12 +; SHADERTEST-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP9]], align 4, !invariant.load !12 ; SHADERTEST-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i64 0 ; SHADERTEST-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i64 1 ; SHADERTEST-NEXT: [[TMP21:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> 
@llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float [[TMP19]], float [[TMP20]], <8 x i32> [[TMP17]], <4 x i32> [[TMP18]], i1 false, i32 0, i32 0) -; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP21]]) #[[ATTR5:[0-9]+]] +; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP21]]) #[[ATTR7:[0-9]+]] ; SHADERTEST-NEXT: ret void ; ; diff --git a/llpc/test/shaderdb/relocatable_shaders/VsGs_Reloc.spvasm b/llpc/test/shaderdb/relocatable_shaders/VsGs_Reloc.spvasm index ed8dceef28..df481b674e 100644 --- a/llpc/test/shaderdb/relocatable_shaders/VsGs_Reloc.spvasm +++ b/llpc/test/shaderdb/relocatable_shaders/VsGs_Reloc.spvasm @@ -11,9 +11,9 @@ ; SHADERTEST_ST-LABEL: {{^}} PalMetadata ; SHADERTEST_ST-LABEL: .hardware_stages ; SHADERTEST_ST-LABEL: .gs: -; SHADERTEST_ST-LABEL: .entry_point: _amdgpu_gs_main +; SHADERTEST_ST-LABEL: .entry_point{{(_symbol)?}}: _amdgpu_gs_main ; SHADERTEST_ST-LABEL: .vs: -; SHADERTEST_ST-LABEL: .entry_point: _amdgpu_vs_main +; SHADERTEST_ST-LABEL: .entry_point{{(_symbol)?}}: _amdgpu_vs_main ; SHADERTEST_ST-LABEL: .type: Gs ; SHADERTEST_ST-LABEL: {{^}}===== AMDLLPC SUCCESS ===== ; END_SHADERTEST_ST diff --git a/llpc/tool/amdllpc.cpp b/llpc/tool/amdllpc.cpp index 69fb031fb7..c9f113606d 100644 --- a/llpc/tool/amdllpc.cpp +++ b/llpc/tool/amdllpc.cpp @@ -441,7 +441,7 @@ static Result init(int argc, char *argv[], ICompiler *&compiler, ShaderCacheWrap gfxipStr = gfxipStr.slice(1, StringRef::npos); if (!gfxipStr.consumeInteger(10, ParsedGfxIp.minor) && gfxipStr.starts_with(".")) { gfxipStr = gfxipStr.slice(1, StringRef::npos); - gfxipStr.consumeInteger(10, ParsedGfxIp.stepping); + gfxipStr.consumeInteger(16, ParsedGfxIp.stepping); } } } diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.cpp b/llpc/translator/lib/SPIRV/SPIRVReader.cpp index b99db2ccc0..9496f53cc8 100644 --- a/llpc/translator/lib/SPIRV/SPIRVReader.cpp +++ 
b/llpc/translator/lib/SPIRV/SPIRVReader.cpp @@ -267,7 +267,7 @@ SPIRVWord getStd430AlignedTypeSize(SPIRVType *const spvType) { return 0; } -static bool isStorageClassExplicitlyLaidOut(SPIRVModule *m_bm, SPIRVStorageClassKind storageClass) { +bool SPIRVToLLVM::isStorageClassExplicitlyLaidOut(SPIRVStorageClassKind storageClass) { return llvm::is_contained({StorageClassStorageBuffer, StorageClassUniform, StorageClassPushConstant, StorageClassPhysicalStorageBufferEXT}, storageClass) || @@ -275,6 +275,21 @@ static bool isStorageClassExplicitlyLaidOut(SPIRVModule *m_bm, SPIRVStorageClass (storageClass == StorageClassWorkgroup && m_bm->hasCapability(CapabilityWorkgroupMemoryExplicitLayoutKHR)); } +bool SPIRVToLLVM::isStorageClassScalarLayout(SPIRVStorageClassKind storageClass) { + // Per spec, maxPipelineRayPayloadSize and maxPipelineRayHitAttributeSize are calculated using scalar layout, we + // need to honor that here to reserve correct size for payload/attribute, e.g., when compiling library. + return llvm::is_contained({StorageClassRayPayloadKHR, StorageClassIncomingRayPayloadKHR, StorageClassHitAttributeKHR}, + storageClass); +} + +LayoutMode SPIRVToLLVM::getLayoutModeForStorageClass(SPIRVStorageClassKind storageClass) { + if (isStorageClassExplicitlyLaidOut(storageClass)) + return LayoutMode::Explicit; + if (isStorageClassScalarLayout(storageClass)) + return LayoutMode::Scalar; + return LayoutMode::Native; +} + SPIRVToLLVM::SPIRVToLLVM(Module *llvmModule, SPIRVModule *theSpirvModule, const SPIRVSpecConstMap &theSpecConstMap, ArrayRef convertingSamplers, lgc::Builder *builder, const Vkgc::ShaderModuleUsage *moduleUsage, const Vkgc::PipelineShaderOptions *shaderOptions) @@ -553,7 +568,7 @@ Type *SPIRVToLLVM::transTypeArray(SPIRVType *const spvType, const unsigned matri } } - const SPIRVWord arrayLength = opcode == OpTypeArray ? spvType->getArrayLength() : SPIRVWORD_MAX; + const SPIRVWord arrayLength = opcode == OpTypeArray ? 
spvType->getArrayLength() : 0; Type *const arrayType = ArrayType::get(elementType, arrayLength); // Setup the replaced array type in case this array is used in default uniform struct: @@ -938,9 +953,9 @@ Type *SPIRVToLLVM::transTypeWithOpcode(SPIRVType *const spvType, c const bool isColumnMajor, LayoutMode layout) { Type *const compType = transType(spvType->getVectorComponentType(), matrixStride, isColumnMajor, layout); - // If the vector needs explicit/std430 layout, we need to use an array to represent it because of LLVM's data layout - // rules. - if (layout == LayoutMode::Explicit || layout == LayoutMode::Std430) + // If the vector needs explicit/std430/scalar layout, we need to use an array to represent it because of LLVM's data + // layout rules. + if (layout == LayoutMode::Explicit || layout == LayoutMode::Std430 || layout == LayoutMode::Scalar) return ArrayType::get(compType, spvType->getVectorComponentCount()); return FixedVectorType::get(compType, spvType->getVectorComponentCount()); } @@ -960,7 +975,8 @@ Type *SPIRVToLLVM::transTypeWithOpcode(SPIRVType *co unsigned rows = spvType->getCooperativeMatrixKHRRows(); unsigned columns = spvType->getCooperativeMatrixKHRColumns(); auto matrixLayout = getCooperativeMatrixKHRLayout(static_cast(use), elemType, rows, columns); - return getBuilder()->getCooperativeMatrixTy(elemType, matrixLayout); + const unsigned kSize = rows > columns ? 
rows : columns; + return getBuilder()->getCooperativeMatrixTy(elemType, matrixLayout, kSize); } // ===================================================================================================================== @@ -994,8 +1010,8 @@ Type *SPIRVToLLVM::getPointeeType(SPIRVValue *v, LayoutMode layout) { return pointeeType; } - if (isStorageClassExplicitlyLaidOut(m_bm, v->getType()->getPointerStorageClass())) - layout = LayoutMode::Explicit; + if (getLayoutModeForStorageClass(v->getType()->getPointerStorageClass()) != LayoutMode::Native) + layout = getLayoutModeForStorageClass(v->getType()->getPointerStorageClass()); return transType(v->getType()->getPointerElementType(), 0, true, layout); } @@ -1271,9 +1287,7 @@ Value *SPIRVToLLVM::transConvertInst(SPIRVValue *bv, Function *f, BasicBlock *bb auto srcSpvType = bc->getOperand(0)->getType(); auto dstSpvType = bc->getType(); auto src = transValue(bc->getOperand(0), f, bb, bb != nullptr); - auto srcType = src->getType(); auto dstType = transType(dstSpvType); - CastInst::CastOps co = Instruction::BitCast; // Extension for OGLP: Only valid for bindless texture/image to convert uvec2 to gsampler/gimage // uniform uvec2 textureHandle; @@ -1288,6 +1302,12 @@ Value *SPIRVToLLVM::transConvertInst(SPIRVValue *bv, Function *f, BasicBlock *bb return transLoadBindlessImage(dstSpvType, imgDescGpuAddress, bindlessTexture); } + bool isDestFloat8 = false; + (void(isDestFloat8)); + + auto srcType = src->getType(); + CastInst::CastOps co = Instruction::BitCast; + lgc::CooperativeMatrixElementType srcElemTy = lgc::CooperativeMatrixElementType::Unknown; lgc::CooperativeMatrixElementType dstElemTy = lgc::CooperativeMatrixElementType::Unknown; lgc::CooperativeMatrixLayout srcLayout = lgc::CooperativeMatrixLayout::InvalidLayout; @@ -1320,15 +1340,53 @@ Value *SPIRVToLLVM::transConvertInst(SPIRVValue *bv, Function *f, BasicBlock *bb co = static_cast(OpCodeMap::rmap(bc->getOpCode())); } - if (dstType == srcType) + if ((dstType == srcType) 
&& (srcElemTy == dstElemTy)) { return src; + } + assert(CastInst::isCast(co) && "Invalid cast op code"); if (bb) { if (bv->getType()->isTypeCooperativeMatrixKHR()) { - Type *matrixType = getBuilder()->getCooperativeMatrixTy(dstElemTy, dstLayout); + unsigned rows = static_cast(dstSpvType)->getCooperativeMatrixKHRRows(); + unsigned columns = static_cast(dstSpvType)->getCooperativeMatrixKHRColumns(); + const unsigned kSize = rows > columns ? rows : columns; + Type *matrixType = getBuilder()->getCooperativeMatrixTy(dstElemTy, dstLayout, kSize); return getBuilder()->create(matrixType, co, src, srcElemTy, dstElemTy, srcLayout, dstLayout, "convert"); } + + if (co == Instruction::FPTrunc) { + if (dstType->getScalarSizeInBits() == srcType->getScalarSizeInBits()) { + assert(dstType->getScalarType()->isBFloatTy() || dstType->getScalarType()->isHalfTy()); + src = getBuilder()->CreateFPExt( + src, dstType->isVectorTy() ? FixedVectorType::get(getBuilder()->getFloatTy(), + cast(dstType)->getNumElements()) + : getBuilder()->getFloatTy()); + } + + RoundingMode rm = RoundingMode::Dynamic; + SPIRVFPRoundingModeKind rounding; + if (bc->hasFPRoundingMode(&rounding)) { + switch (rounding) { + case FPRoundingModeRTE: + rm = RoundingMode::NearestTiesToEven; + break; + case FPRoundingModeRTZ: + rm = RoundingMode::TowardZero; + break; + case FPRoundingModeRTP: + rm = RoundingMode::TowardPositive; + break; + case FPRoundingModeRTN: + rm = RoundingMode::TowardNegative; + break; + default: + llvm_unreachable("Should never be called!"); + } + return getBuilder()->CreateFpTruncWithRounding(src, dstType, rm); + } + } + bool srcIsPtr = srcType->isPtrOrPtrVectorTy(); bool dstIsPtr = dstType->isPtrOrPtrVectorTy(); // OpBitcast in SPIR-V allows casting between pointers and integers (and integer vectors), @@ -1351,7 +1409,8 @@ Value *SPIRVToLLVM::transConvertInst(SPIRVValue *bv, Function *f, BasicBlock *bb return new IntToPtrInst(src, dstType, bv->getName(), bb); } } else { - return 
CastInst::Create(co, src, dstType, bv->getName(), bb); + Value *ret = CastInst::Create(co, src, dstType, bv->getName(), bb); + return ret; } } return ConstantExpr::getCast(co, dyn_cast(src), dstType); @@ -1965,7 +2024,7 @@ std::pair SPIRVToLLVM::createLaunderRowMajorMatrix(Type *const Type *const matrixPointerType = pointerToMatrix->getType(); Type *const newMatrixType = getTransposedType(matrixType); - Type *const newMatrixPointerType = newMatrixType->getPointerTo(matrixPointerType->getPointerAddressSpace()); + Type *const newMatrixPointerType = getBuilder()->getPtrTy(matrixPointerType->getPointerAddressSpace()); // Dummy value used to remember matrixType which will be used in postProcessRowMajorMatrix. Value *dummyValue = Constant::getNullValue(matrixType); @@ -2032,7 +2091,7 @@ Value *SPIRVToLLVM::addLoadInstRecursively(SPIRVType *const spvType, Value *load auto pair = std::make_pair(spvType, i); if (m_overlappingStructTypeWorkaroundMap.count(pair) > 0) { memberLoadType = m_overlappingStructTypeWorkaroundMap[pair]; - Type *const type = memberLoadType->getPointerTo(memberLoadPointer->getType()->getPointerAddressSpace()); + Type *const type = getBuilder()->getPtrTy(memberLoadPointer->getType()->getPointerAddressSpace()); memberLoadPointer = getBuilder()->CreateBitCast(memberLoadPointer, type); } @@ -2277,7 +2336,7 @@ Constant *SPIRVToLLVM::buildConstStoreRecursively(SPIRVType *const spvType, Type Constant *indices[] = {zero, getBuilder()->getInt32(memberIndex)}; Type *const memberStoreType = GetElementPtrInst::getIndexedType(storeType, indices); constMembers[memberIndex] = - buildConstStoreRecursively(spvType->getStructMemberType(i), memberStoreType->getPointerTo(addrSpace), + buildConstStoreRecursively(spvType->getStructMemberType(i), getBuilder()->getPtrTy(addrSpace), memberStoreType, constStoreValue->getAggregateElement(i)); } @@ -2302,9 +2361,8 @@ Constant *SPIRVToLLVM::buildConstStoreRecursively(SPIRVType *const spvType, Type indices.push_back(zero); Type 
*const elementStoreType = GetElementPtrInst::getIndexedType(storeType, indices); - Constant *const constElement = - buildConstStoreRecursively(spvElementType, elementStoreType->getPointerTo(addrSpace), elementStoreType, - constStoreValue->getAggregateElement(i)); + Constant *const constElement = buildConstStoreRecursively( + spvElementType, getBuilder()->getPtrTy(addrSpace), elementStoreType, constStoreValue->getAggregateElement(i)); if (needsPad) { constElements[i] = llvm::ConstantFoldInsertValueInstruction(constElements[i], constElement, 0); @@ -2343,15 +2401,15 @@ Constant *SPIRVToLLVM::buildConstStoreRecursively(SPIRVType *const spvType, Type // Translate scope from SPIR-V to LLVM. // // @param context : The LLVM context. -// @param spvScope : The scope to translate. -static SyncScope::ID transScope(LLVMContext &context, const SPIRVConstant *const spvScope) { - const unsigned scope = static_cast(spvScope->getZExtIntValue()); - +// @param scope : The SPIR-V scope to translate. +static SyncScope::ID transScope(LLVMContext &context, unsigned scope) { switch (scope) { case ScopeCrossDevice: + return SyncScope::System; case ScopeDevice: case ScopeQueueFamilyKHR: - return SyncScope::System; + case ScopeShaderCallKHR: + return context.getOrInsertSyncScopeID("agent"); case ScopeInvocation: return SyncScope::SingleThread; case ScopeWorkgroup: @@ -2398,7 +2456,8 @@ static AtomicOrdering transMemorySemantics(const SPIRVConstant *const spvMemoryS Value *SPIRVToLLVM::transAtomicRMW(SPIRVValue *const spvValue, const AtomicRMWInst::BinOp binOp) { SPIRVAtomicInstBase *const spvAtomicInst = static_cast(spvValue); - const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))); + const SyncScope::ID scope = + transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))->getZExtIntValue()); const AtomicOrdering ordering = transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2))); Value *const atomicPointer = 
transValue(spvAtomicInst->getOpValue(0), getBuilder()->GetInsertBlock()->getParent(), @@ -2436,7 +2495,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *c SPIRVAtomicLoad *const spvAtomicLoad = static_cast(spvValue); - const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicLoad->getOpValue(1))); + const SyncScope::ID scope = + transScope(*m_context, static_cast(spvAtomicLoad->getOpValue(1))->getZExtIntValue()); const AtomicOrdering ordering = transMemorySemantics(static_cast(spvAtomicLoad->getOpValue(2)), /*readOnly=*/true, /*writeOnly=*/false); @@ -2464,7 +2524,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue * SPIRVAtomicStore *const spvAtomicStore = static_cast(spvValue); - const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicStore->getOpValue(1))); + const SyncScope::ID scope = + transScope(*m_context, static_cast(spvAtomicStore->getOpValue(1))->getZExtIntValue()); const AtomicOrdering ordering = transMemorySemantics(static_cast(spvAtomicStore->getOpValue(2)), /*readOnly=*/false, /*writeOnly=*/true); @@ -2663,7 +2724,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVVa SPIRVAtomicInstBase *const spvAtomicInst = static_cast(spvValue); - const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))); + const SyncScope::ID scope = + transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))->getZExtIntValue()); const AtomicOrdering ordering = transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2))); Value *const atomicPointer = transValue(spvAtomicInst->getOpValue(0), getBuilder()->GetInsertBlock()->getParent(), @@ -2690,7 +2752,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVVa SPIRVAtomicInstBase *const spvAtomicInst = static_cast(spvValue); - const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))); + const SyncScope::ID scope = + transScope(*m_context, 
static_cast(spvAtomicInst->getOpValue(1))->getZExtIntValue()); const AtomicOrdering ordering = transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2))); Value *const atomicPointer = transValue(spvAtomicInst->getOpValue(0), getBuilder()->GetInsertBlock()->getParent(), @@ -2718,7 +2781,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SP SPIRVAtomicInstBase *const spvAtomicInst = static_cast(spvValue); - const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))); + const SyncScope::ID scope = + transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))->getZExtIntValue()); const AtomicOrdering successOrdering = transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2))); AtomicOrdering failureOrdering = transMemorySemantics(static_cast(spvAtomicInst->getOpValue(3)), @@ -2813,9 +2877,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *c spvCopyMemLoadType = spvLoadType->getPointerElementType(); spvCopyMemStoreType = spvStoreType->getPointerElementType(); } - LayoutMode loadLayout = isStorageClassExplicitlyLaidOut(m_bm, spvLoadType->getPointerStorageClass()) - ? LayoutMode::Explicit - : LayoutMode::Native; + LayoutMode loadLayout = getLayoutModeForStorageClass(spvLoadType->getPointerStorageClass()); Type *const loadType = transType(spvCopyMemLoadType, 0, true, loadLayout); bool isNonTemporal = spvCopyMemory->SPIRVMemoryAccess::isNonTemporal(true); Value *const load = @@ -2824,9 +2886,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *c Value *const storePointer = transValue(spvCopyMemory->getTarget(), getBuilder()->GetInsertBlock()->getParent(), getBuilder()->GetInsertBlock()); - LayoutMode storeLayout = isStorageClassExplicitlyLaidOut(m_bm, spvStoreType->getPointerStorageClass()) - ? 
LayoutMode::Explicit - : LayoutMode::Native; + LayoutMode storeLayout = getLayoutModeForStorageClass(spvStoreType->getPointerStorageClass()); Type *const storeType = transType(spvCopyMemStoreType, 0, true, storeLayout); isNonTemporal = spvCopyMemory->SPIRVMemoryAccess::isNonTemporal(false); @@ -2879,6 +2939,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const s layout = LayoutMode::Std430; break; } + } else if (isStorageClassScalarLayout(storageClassKind)) { + layout = LayoutMode::Scalar; } bool isVolatile = spvLoad->SPIRVMemoryAccess::isVolatile(true); @@ -3347,6 +3409,8 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { layout = isAccelerationStructureType(spvUltimateElementType) ? LayoutMode::Explicit : LayoutMode::Std430; break; } + } else if (isStorageClassScalarLayout(storageClass)) { + layout = LayoutMode::Scalar; } // Determine whether result/base is the mixed image/non-image case @@ -3402,7 +3466,7 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { const SPIRVStorageClassKind pointerStorageClass = spvBaseType->getPointerStorageClass(); - const bool typeMaybeRemapped = isStorageClassExplicitlyLaidOut(m_bm, pointerStorageClass) || + const bool typeMaybeRemapped = (getLayoutModeForStorageClass(pointerStorageClass) != LayoutMode::Native) || pointerStorageClass == StorageClassUniformConstant; Type *basePointeeType = nullptr; @@ -3465,8 +3529,7 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { if (castType) { flushGep(); basePointeeType = castType; - base = getBuilder()->CreateBitCast(base, - basePointeeType->getPointerTo(base->getType()->getPointerAddressSpace())); + base = getBuilder()->CreateBitCast(base, getBuilder()->getPtrTy(base->getType()->getPointerAddressSpace())); } spvAccessElementType = spvAccessElementType->getStructMemberType(origMemberIndex); @@ -3555,7 +3618,7 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { 
getBuilder()->getInt32((unsigned)elemType), getBuilder()->getInt32((unsigned)layout), }; - Type *retType = basePointeeType->getPointerTo(base->getType()->getPointerAddressSpace()); + Type *retType = getBuilder()->getPtrTy(base->getType()->getPointerAddressSpace()); appendTypeMangling(retType, args, mangledName); base = getBuilder()->CreateNamedCall(mangledName, retType, args, {Attribute::ReadNone, Attribute::NoUnwind}); @@ -4265,7 +4328,7 @@ Value *SPIRV::SPIRVToLLVM::createTraceRayDialectOp(SPIRVValue *const spvValue) { auto accelStructAsI64 = getBuilder()->CreateBitCast(accelStruct, getBuilder()->getInt64Ty()); - Type *payloadTy = transType(spvOperands[10]->getType()->getPointerElementType()); + Type *payloadTy = transType(spvOperands[10]->getType()->getPointerElementType(), 0, true, LayoutMode::Scalar); // Wrap payload with struct, PAQ handling expects a struct type. // FIXME: We should support non-struct types for PAQ @@ -4423,18 +4486,19 @@ Value *SPIRVToLLVM::transGroupArithOp(Builder::GroupArithOp groupArithOp, SPIRVV BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); - Value *const value = transValue(spvOperands[2], func, block); + Value *const clusterSize = + spvOperands.size() > 3 ? 
transValue(spvOperands[3], func, block) : getBuilder()->CreateGetWaveSize(); switch (static_cast(spvOperands[1])->getZExtIntValue()) { case GroupOperationReduce: - return getBuilder()->CreateSubgroupClusteredReduction(groupArithOp, value, getBuilder()->CreateGetWaveSize()); + return getBuilder()->CreateSubgroupClusteredReduction(groupArithOp, value, clusterSize); case GroupOperationInclusiveScan: - return getBuilder()->CreateSubgroupClusteredInclusive(groupArithOp, value, getBuilder()->CreateGetWaveSize()); + return getBuilder()->CreateSubgroupClusteredInclusive(groupArithOp, value, clusterSize); case GroupOperationExclusiveScan: - return getBuilder()->CreateSubgroupClusteredExclusive(groupArithOp, value, getBuilder()->CreateGetWaveSize()); + return getBuilder()->CreateSubgroupClusteredExclusive(groupArithOp, value, clusterSize); case GroupOperationClusteredReduce: - return getBuilder()->CreateSubgroupClusteredReduction(groupArithOp, value, transValue(spvOperands[3], func, block)); + return getBuilder()->CreateSubgroupClusteredReduction(groupArithOp, value, clusterSize); default: llvm_unreachable("Should never be called!"); return nullptr; @@ -5001,7 +5065,7 @@ Value *SPIRVToLLVM::transVariableNonImage(SPIRVValue *const spvValue) { const SPIRVStorageClassKind storageClass = spvVar->getStorageClass(); SPIRVType *spvVarType = spvVar->getMemObjType(); - LayoutMode layout = isStorageClassExplicitlyLaidOut(m_bm, storageClass) ? 
LayoutMode::Explicit : LayoutMode::Native; + LayoutMode layout = getLayoutModeForStorageClass(storageClass); if (storageClass == StorageClassUniformConstant) { SPIRVType *spvElementType = spvVarType; while (spvElementType->getOpCode() == OpTypeArray || spvElementType->getOpCode() == OpTypeRuntimeArray) @@ -5023,8 +5087,12 @@ Value *SPIRVToLLVM::transVariableNonImage(SPIRVValue *const spvValue) { Type *const ptrType = transType(spvVar->getType()); unsigned addrSpace = ptrType->getPointerAddressSpace(); auto llpcContext = static_cast(m_context); - auto buildInfo = static_cast(llpcContext->getPipelineBuildInfo()); - + // enableInitUndefZero is only supported for Graphics pipelines - assume false otherwise + bool enableInitUndefZero = false; + if (llpcContext->getPipelineType() == PipelineType::Graphics) { + auto buildInfo = static_cast(llpcContext->getPipelineBuildInfo()); + enableInitUndefZero = buildInfo->enableInitUndefZero; + } Type *const varType = transType(spvVarType, 0, true, layout); if (m_hasSamplerInStruct && storageClass == StorageClassUniformConstant) { @@ -5063,8 +5131,8 @@ Value *SPIRVToLLVM::transVariableNonImage(SPIRVValue *const spvValue) { // Initialize user-defined output variable to zero initializer = Constant::getNullValue(varType); } - } else if (buildInfo->enableInitUndefZero && (storageClass == SPIRVStorageClassKind::StorageClassPrivate || - storageClass == SPIRVStorageClassKind::StorageClassFunction)) { + } else if (enableInitUndefZero && (storageClass == SPIRVStorageClassKind::StorageClassPrivate || + storageClass == SPIRVStorageClassKind::StorageClassFunction)) { initializer = Constant::getNullValue(varType); } @@ -5359,25 +5427,6 @@ Value *SPIRVToLLVM::transString(const SPIRVString *spvValue) { // | iu4 | i32 | Y | Y | // For integer types, arbitrary signedness combinations are supported for the // A/B matrices.C/D matrices are always signed. 
- -lgc::CooperativeMatrixElementType SPIRVToLLVM::mapToBasicType(Type *const elemType) { - lgc::CooperativeMatrixElementType basicTy = lgc::CooperativeMatrixElementType::Unknown; - if (elemType->isIntegerTy(8)) { - basicTy = lgc::CooperativeMatrixElementType::Int8; - } else if (elemType->isIntegerTy(16)) { - basicTy = lgc::CooperativeMatrixElementType::Int16; - } else if (elemType->isIntegerTy(32)) { - basicTy = lgc::CooperativeMatrixElementType::Int32; - } else if (elemType->isFloatTy()) { - basicTy = lgc::CooperativeMatrixElementType::Float32; - } else if (elemType->isHalfTy()) { - basicTy = lgc::CooperativeMatrixElementType::Float16; - } else { - llvm_unreachable("The element type is not supported!"); - } - return basicTy; -} - lgc::CooperativeMatrixElementType SPIRVToLLVM::mapToBasicType(SPIRVType *const elemType) { lgc::CooperativeMatrixElementType basicTy = lgc::CooperativeMatrixElementType::Unknown; if (elemType->isTypeInt(8)) { @@ -5405,7 +5454,7 @@ lgc::CooperativeMatrixElementType SPIRVToLLVM::mapToBasicType(SPIRVType *const e lgc::CooperativeMatrixLayout SPIRVToLLVM::getCooperativeMatrixKHRLayout(CooperativeMatrixUse use, lgc::CooperativeMatrixElementType elemType, unsigned rows, unsigned columns) { - const Vkgc::GfxIpVersion gfxIp = getPipelineContext()->getGfxIpVersion(); + [[maybe_unused]] const Vkgc::GfxIpVersion gfxIp = getPipelineContext()->getGfxIpVersion(); if (use == CooperativeMatrixUse::CooperativeMatrixUseMatrixAKHR || use == CooperativeMatrixUse::CooperativeMatrixUseMatrixBKHR) { @@ -5436,7 +5485,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcodegetCooperativeMatrixKHRRows(); unsigned columns = matrixType->getCooperativeMatrixKHRColumns(); auto layout = getCooperativeMatrixKHRLayout(matrixUse, elemType, rows, columns); - return getBuilder()->create(layout); + const unsigned kSize = rows > columns ? 
rows : columns; + return getBuilder()->create(layout, kSize); } // ===================================================================================================================== @@ -5525,9 +5575,10 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode unsigned alignmentInRowCol = (isColMajor ? rows : columns) * elementSize; unsigned loadAlignment = std::min((unsigned)16, alignmentInRowCol); lgc::CooperativeMatrixLayout layout = getCooperativeMatrixKHRLayout(use, elemType, rows, columns); - Type *coopMatrixTy = getBuilder()->getCooperativeMatrixTy(elemType, layout); + const unsigned kSize = rows > columns ? rows : columns; + Type *coopMatrixTy = getBuilder()->getCooperativeMatrixTy(elemType, layout, kSize); auto CoopMatLoadInst = getBuilder()->create( - coopMatrixTy, pointer, stride, isColMajor, elemType, layout, memoryAccess, loadAlignment, "load"); + coopMatrixTy, pointer, stride, isColMajor, elemType, layout, memoryAccess, loadAlignment, kSize, "load"); return CoopMatLoadInst; } @@ -5591,7 +5642,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcodegetObject()->getType()->getCooperativeMatrixKHRComponentType()); - lgc::CooperativeMatrixElementType elemType = mapToBasicType(elemltType); + lgc::CooperativeMatrixElementType elemType = + mapToBasicType(coopMatStore->getObject()->getType()->getCooperativeMatrixKHRComponentType()); CooperativeMatrixUse use = static_cast(coopMatStore->getObject()->getType()->getCooperativeMatrixKHRUse()); @@ -5623,8 +5675,9 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode columns ? 
rows : columns; getBuilder()->create(pointer, stride, isColMajor, elemType, layout, memoryAccess, - storeAlignment, matrix); + storeAlignment, matrix, kSize); return nullptr; } @@ -5650,9 +5703,14 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(static_cast(spvInst)->getMatrixBSigned()); bool isSat = static_cast(static_cast(spvInst)->getMatrixSatAccumulation()); + [[maybe_unused]] const Vkgc::GfxIpVersion gfxIp = getPipelineContext()->getGfxIpVersion(); + unsigned kMultiplier = 1; + + Type *coopMatrixDType = coopMatrixC->getType(); + lgc::CooperativeMatrixElementType elemBasicTypeD = elemBasicTypeC; Value *coopMatrixD = getBuilder()->create( - coopMatrixC->getType(), coopMatrixA, coopMatrixB, coopMatrixC, isSignedA, isSignedB, isSat, 0, elemBasicTypeA, - elemBasicTypeA, elemBasicTypeC, "mulAdd"); + coopMatrixDType, coopMatrixA, coopMatrixB, coopMatrixC, isSignedA, isSignedB, isSat, 0, elemBasicTypeA, + elemBasicTypeA, elemBasicTypeC, elemBasicTypeD, kMultiplier, "mulAdd"); return coopMatrixD; } @@ -5793,6 +5851,52 @@ SmallVector SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *bv, Fu } } + case OpConstantCompositeReplicateEXT: + case OpSpecConstantCompositeReplicateEXT: { + auto bcc = static_cast(bv); + auto bccElements = bcc->getElements(); + + std::vector cv; + + switch (bv->getType()->getOpCode()) { + case OpTypeVector: { + unsigned compCount = ((SPIRV::SPIRVTypeVector *)bv->getType())->getComponentCount(); + for (unsigned i = 0; i < compCount; ++i) { + cv.push_back(dyn_cast(transValue(bccElements[0], f, bb))); + } + return mapValue(bv, ConstantVector::get(cv)); + } + case OpTypeArray: { + unsigned length = ((SPIRV::SPIRVTypeArray *)bv->getType())->getLength()->getZExtIntValue(); + for (unsigned i = 0; i < length; ++i) { + cv.push_back(dyn_cast(transValue(bccElements[0], f, bb))); + } + return mapValue(bv, ConstantArray::get(dyn_cast(transType(bcc->getType())), cv)); + } + case OpTypeStruct: { + unsigned memCount = ((SPIRV::SPIRVTypeStruct 
*)bv->getType())->getMemberCount(); + for (unsigned i = 0; i < memCount; ++i) { + cv.push_back(dyn_cast(transValue(bccElements[0], f, bb))); + } + return mapValue(bv, ConstantStruct::get(dyn_cast(transType(bcc->getType())), cv)); + } + case OpTypeMatrix: { + unsigned matCount = bv->getType()->getMatrixColumnCount(); + for (unsigned i = 0; i < matCount; ++i) { + cv.push_back(dyn_cast(transValue(bccElements[0], f, bb))); + } + return mapValue(bv, ConstantArray::get(dyn_cast(transType(bcc->getType())), cv)); + } + case OpTypeCooperativeMatrixKHR: { + auto elements = transValue(bcc->getElements(), f, bb); + return mapValue(bv, transCooperativeMatrixKHRFromConstruct(bcc->getType(), elements)); + } + default: + llvm_unreachable("not implemented"); + return {}; + } + } + case OpSpecConstantOp: { auto bi = static_cast(bv)->getMappedConstant(); return mapValue(bv, transValue(bi, nullptr, nullptr, false)); @@ -6139,6 +6243,58 @@ SmallVector SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *bv, Fu } } + case OpCompositeConstructReplicateEXT: { + auto cc = static_cast(bv); + auto constituents = transValue(cc->getConstituents(), f, bb); + + switch (bv->getType()->getOpCode()) { + case OpTypeVector: { + auto vecTy = transType(cc->getType()); + unsigned compCount = ((SPIRV::SPIRVTypeVector *)cc->getType())->getComponentCount(); + Value *v = PoisonValue::get(vecTy); + for (unsigned i = 0; i < compCount; ++i) { + v = InsertElementInst::Create(v, constituents[0], ConstantInt::get(*m_context, APInt(32, i)), "", bb); + } + return mapValue(bv, v); + } + case OpTypeArray: { + auto aryTy = transType(cc->getType()); + Value *v = PoisonValue::get(aryTy); + unsigned length = ((SPIRV::SPIRVTypeArray *)cc->getType())->getLength()->getZExtIntValue(); + for (unsigned i = 0; i < length; ++i) { + v = InsertValueInst::Create(v, constituents[0], i, "", bb); + } + return mapValue(bv, v); + } + case OpTypeStruct: { + auto ccTy = transType(cc->getType()); + Value *v = PoisonValue::get(ccTy); + 
unsigned memCount = ((SPIRV::SPIRVTypeStruct *)cc->getType())->getMemberCount(); + for (unsigned i = 0; i < memCount; ++i) { + v = InsertValueInst::Create(v, constituents[0], i, "", bb); + } + return mapValue(bv, v); + } + case OpTypeMatrix: { + auto bvTy = bv->getType(); + auto matClmTy = transType(bvTy->getMatrixColumnType()); + auto matCount = bvTy->getMatrixColumnCount(); + auto matTy = ArrayType::get(matClmTy, matCount); + + Value *v = PoisonValue::get(matTy); + for (unsigned i = 0; i < matCount; ++i) { + v = InsertValueInst::Create(v, constituents[0], i, "", bb); + } + return mapValue(bv, v); + } + case OpTypeCooperativeMatrixKHR: { + return mapValue(bv, transCooperativeMatrixKHRFromConstruct(cc->getType(), constituents)); + } + default: + llvm_unreachable("Unhandled type!"); + } + } + case OpCompositeExtract: { SPIRVCompositeExtract *ce = static_cast(bv); if (ce->getComposite()->getType()->isTypeVector()) { @@ -6345,66 +6501,6 @@ SmallVector SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *bv, Fu return mapValue(bv, fNeg); } - case OpFConvert: { - SPIRVUnary *bc = static_cast(bv); - Value *val = transValue(bc->getOperand(0), f, bb); - Type *destTy = transType(bc->getType()); - // Can't use destTy as for transType will return packed element Type. - CastInst::CastOps co = Instruction::BitCast; - if (bv->getType()->isTypeCooperativeMatrixKHR()) { - SPIRVType *dstType = bc->getType()->getCooperativeMatrixKHRComponentType(); - lgc::CooperativeMatrixElementType basicDstElemTy = mapToBasicType(dstType); - SPIRVType *srcType = bc->getOperand(0)->getType()->getCooperativeMatrixKHRComponentType(); - bool isExt = dstType->getBitWidth() > srcType->getBitWidth(); - co = isExt ? 
Instruction::FPExt : Instruction::FPTrunc; - lgc::CooperativeMatrixElementType basicSrcElemTy = mapToBasicType(srcType); - lgc::CooperativeMatrixLayout srcLayout = getCooperativeMatrixKHRLayout( - static_cast(bc->getType()->getCooperativeMatrixKHRUse()), basicSrcElemTy, - bc->getType()->getCooperativeMatrixKHRRows(), bc->getType()->getCooperativeMatrixKHRColumns()); - lgc::CooperativeMatrixLayout dstLayout = getCooperativeMatrixKHRLayout( - static_cast(bc->getType()->getCooperativeMatrixKHRUse()), basicDstElemTy, - bc->getType()->getCooperativeMatrixKHRRows(), bc->getType()->getCooperativeMatrixKHRColumns()); - - Type *matrixType = getBuilder()->getCooperativeMatrixTy(basicDstElemTy, dstLayout); - return mapValue(bv, getBuilder()->create( - matrixType, co, val, basicSrcElemTy, basicDstElemTy, srcLayout, dstLayout, "fConvert")); - } - unsigned valTypeBitWide = val->getType()->getScalarType()->getPrimitiveSizeInBits(); - unsigned destTypeBitWide = destTy->getScalarType()->getPrimitiveSizeInBits(); - if (valTypeBitWide < destTypeBitWide) - return mapValue(bv, getBuilder()->CreateFPExt(val, destTy)); - else if (valTypeBitWide == destTypeBitWide) { - assert(val->getType()->getScalarType()->isBFloatTy() || val->getType()->getScalarType()->isHalfTy()); - val = getBuilder()->CreateFPExt( - val, destTy->isVectorTy() - ? 
FixedVectorType::get(getBuilder()->getFloatTy(), cast(destTy)->getNumElements()) - : getBuilder()->getFloatTy()); - } - - RoundingMode rm = RoundingMode::Dynamic; - SPIRVFPRoundingModeKind rounding; - if (bc->hasFPRoundingMode(&rounding)) { - switch (rounding) { - case FPRoundingModeRTE: - rm = RoundingMode::NearestTiesToEven; - break; - case FPRoundingModeRTZ: - rm = RoundingMode::TowardZero; - break; - case FPRoundingModeRTP: - rm = RoundingMode::TowardPositive; - break; - case FPRoundingModeRTN: - rm = RoundingMode::TowardNegative; - break; - default: - llvm_unreachable("Should never be called!"); - } - return mapValue(bv, getBuilder()->CreateFpTruncWithRounding(val, destTy, rm)); - } - return mapValue(bv, getBuilder()->CreateFPTrunc(val, destTy)); - } - case OpBitCount: { SPIRVUnary *bc = static_cast(bv); Value *val = transValue(bc->getOperand(0), f, bb); @@ -7354,7 +7450,7 @@ static unsigned convertDimension(const SPIRVTypeImageDescriptor *desc) { // ============================================================================= // Scan backwards from an image/sampler or pointer-to-image/sampler value and set non-uniform/coherent/volatile flags. 
static void scanImageDescNonUniformCV(SPIRVToLLVM::ExtractedImageInfo *info, SPIRVValue *spvValue, bool image, - bool sampler) { + bool sampler, const PipelineShaderOptions *shaderOption) { for (;;) { if (image) { if (spvValue->hasDecorate(DecorationCoherent)) @@ -7395,9 +7491,9 @@ static void scanImageDescNonUniformCV(SPIRVToLLVM::ExtractedImageInfo *info, SPI if (opcode == OpSampledImage) { auto *sampledImage = static_cast(spvValue); if (image) - scanImageDescNonUniformCV(info, sampledImage->getOperands()[0], true, false); + scanImageDescNonUniformCV(info, sampledImage->getOperands()[0], true, false, shaderOption); if (sampler) - scanImageDescNonUniformCV(info, sampledImage->getOperands()[1], false, true); + scanImageDescNonUniformCV(info, sampledImage->getOperands()[1], false, true, shaderOption); break; } @@ -7546,7 +7642,8 @@ void SPIRVToLLVM::getImageDesc(SPIRVValue *bImageInst, ExtractedImageInfo *info) info->flags |= lgc::Builder::ImageFlagNonUniformSampler; } - scanImageDescNonUniformCV(info, bImageInst, components & ImageComponentImage, components & ImageComponentSampler); + scanImageDescNonUniformCV(info, bImageInst, components & ImageComponentImage, components & ImageComponentSampler, + m_shaderOptions); bool imageUniform = (components & ImageComponentImage) && !(info->flags & lgc::Builder::ImageFlagNonUniformImage); bool samplerUniform = (components & ImageComponentSampler) && !(info->flags & lgc::Builder::ImageFlagNonUniformSampler); @@ -8909,25 +9006,32 @@ bool SPIRVToLLVM::transMetadata() { for (unsigned i = 0, e = m_bm->getNumConstants(); i != e; ++i) { auto bv = m_bm->getConstant(i); SPIRVWord builtIn = SPIRVID_INVALID; - if ((bv->getOpCode() == OpSpecConstant || bv->getOpCode() == OpSpecConstantComposite || - bv->getOpCode() == OpConstant || bv->getOpCode() == OpConstantComposite) && - bv->hasDecorate(DecorationBuiltIn, 0, &builtIn)) { - if (builtIn == spv::BuiltInWorkgroupSize) { - // NOTE: Overwrite values of local sizes specified in execution 
- // mode if the constant corresponding to gl_WorkGroupSize - // exists. Take its value since gl_WorkGroupSize could be a - // specialization constant. - auto workGroupSize = static_cast(bv); - + if (bv->hasDecorate(DecorationBuiltIn, 0, &builtIn) && (builtIn == spv::BuiltInWorkgroupSize)) { + // NOTE: Overwrite values of local sizes specified in execution + // mode if the constant corresponding to gl_WorkGroupSize + // exists. Take its value since gl_WorkGroupSize could be a + // specialization constant. + auto workGroupSize = static_cast(bv); + if (bv->getOpCode() == OpSpecConstant || bv->getOpCode() == OpSpecConstantComposite || + bv->getOpCode() == OpConstant || bv->getOpCode() == OpConstantComposite) { // Declared: const uvec3 gl_WorkGroupSize assert(workGroupSize->getElements().size() == 3); - auto workGroupSizeX = static_cast(workGroupSize->getElements()[0]); - auto workGroupSizeY = static_cast(workGroupSize->getElements()[1]); - auto workGroupSizeZ = static_cast(workGroupSize->getElements()[2]); + meshMode.workgroupSizeX = + static_cast(workGroupSize->getElements()[0])->getZExtIntValue(); + meshMode.workgroupSizeY = + static_cast(workGroupSize->getElements()[1])->getZExtIntValue(); + meshMode.workgroupSizeZ = + static_cast(workGroupSize->getElements()[2])->getZExtIntValue(); - meshMode.workgroupSizeX = workGroupSizeX->getZExtIntValue(); - meshMode.workgroupSizeY = workGroupSizeY->getZExtIntValue(); - meshMode.workgroupSizeZ = workGroupSizeZ->getZExtIntValue(); + break; + } else if (bv->getOpCode() == OpConstantCompositeReplicateEXT || + bv->getOpCode() == OpSpecConstantCompositeReplicateEXT) { + meshMode.workgroupSizeX = + static_cast(workGroupSize->getElements()[0])->getZExtIntValue(); + meshMode.workgroupSizeY = + static_cast(workGroupSize->getElements()[0])->getZExtIntValue(); + meshMode.workgroupSizeZ = + static_cast(workGroupSize->getElements()[0])->getZExtIntValue(); break; } @@ -9024,22 +9128,27 @@ bool SPIRVToLLVM::transMetadata() { for (unsigned i 
= 0, e = m_bm->getNumConstants(); i != e; ++i) { auto bv = m_bm->getConstant(i); SPIRVWord builtIn = SPIRVID_INVALID; - if ((bv->getOpCode() == OpSpecConstant || bv->getOpCode() == OpSpecConstantComposite || - bv->getOpCode() == OpConstant || bv->getOpCode() == OpConstantComposite) && - bv->hasDecorate(DecorationBuiltIn, 0, &builtIn)) { - if (builtIn == spv::BuiltInWorkgroupSize) { - // NOTE: Overwrite values of local sizes specified in execution - // mode if the constant corresponding to gl_WorkGroupSize - // exists. Take its value since gl_WorkGroupSize could be a - // specialization constant. - auto workGroupSize = static_cast(bv); - + if (bv->hasDecorate(DecorationBuiltIn, 0, &builtIn) && (builtIn == spv::BuiltInWorkgroupSize)) { + // NOTE: Overwrite values of local sizes specified in execution + // mode if the constant corresponding to gl_WorkGroupSize + // exists. Take its value since gl_WorkGroupSize could be a + // specialization constant. + auto workGroupSize = static_cast(bv); + if (bv->getOpCode() == OpSpecConstant || bv->getOpCode() == OpSpecConstantComposite || + bv->getOpCode() == OpConstant || bv->getOpCode() == OpConstantComposite) { // Declared: const uvec3 gl_WorkGroupSize assert(workGroupSize->getElements().size() == 3); workgroupSizeX = static_cast(workGroupSize->getElements()[0])->getZExtIntValue(); workgroupSizeY = static_cast(workGroupSize->getElements()[1])->getZExtIntValue(); workgroupSizeZ = static_cast(workGroupSize->getElements()[2])->getZExtIntValue(); + break; + } else if (bv->getOpCode() == OpConstantCompositeReplicateEXT || + bv->getOpCode() == OpSpecConstantCompositeReplicateEXT) { + workgroupSizeX = static_cast(workGroupSize->getElements()[0])->getZExtIntValue(); + workgroupSizeY = static_cast(workGroupSize->getElements()[0])->getZExtIntValue(); + workgroupSizeZ = static_cast(workGroupSize->getElements()[0])->getZExtIntValue(); + break; } } @@ -10382,7 +10491,7 @@ Value *SPIRVToLLVM::transGLSLExtInst(SPIRVExtInst *extInst, 
BasicBlock *bb) { // storing. if (exp->getType()->isVectorTy()) { assert(args[1]->getType()->isPointerTy()); - Type *const castType = exp->getType()->getPointerTo(args[1]->getType()->getPointerAddressSpace()); + Type *const castType = getBuilder()->getPtrTy(args[1]->getType()->getPointerAddressSpace()); args[1] = getBuilder()->CreateBitCast(args[1], castType); } getBuilder()->CreateStore(exp, args[1]); @@ -10717,6 +10826,9 @@ void SPIRVToLLVM::transMemFence(BasicBlock *bb, SPIRVWord memSema, SPIRVWord mem if (ordering == AtomicOrdering::NotAtomic) return; + if (m_shaderOptions->forceMemoryBarrierScope) + memScope = m_shaderOptions->forceMemoryBarrierScope; + // Downgrade ScopeDevice to ScopeWorkgroup if memory semantics permits it. // If memory semantics implies that shared memory is local to a workgroup, no need for ScopeDevice that would mean all // workgroups in the device. @@ -10729,31 +10841,7 @@ void SPIRVToLLVM::transMemFence(BasicBlock *bb, SPIRVWord memSema, SPIRVWord mem memScope = ScopeWorkgroup; } - SyncScope::ID scope = SyncScope::System; - - switch (memScope) { - case ScopeCrossDevice: - scope = SyncScope::System; - break; - case ScopeQueueFamilyKHR: - case ScopeShaderCallKHR: - case ScopeDevice: - scope = m_context->getOrInsertSyncScopeID("agent"); - break; - case ScopeInvocation: - scope = SyncScope::SingleThread; - break; - case ScopeWorkgroup: - scope = m_context->getOrInsertSyncScopeID("workgroup"); - break; - case ScopeSubgroup: - scope = m_context->getOrInsertSyncScopeID("wavefront"); - break; - default: - llvm_unreachable("Invalid scope"); - } - - getBuilder()->CreateFence(ordering, scope); + getBuilder()->CreateFence(ordering, transScope(*m_context, memScope)); } void SPIRVToLLVM::transBarrierFence(SPIRVInstruction *mb, BasicBlock *bb) { @@ -11308,19 +11396,21 @@ Value *SPIRVToLLVM::transCooperativeMatrixArithInst(SPIRVValue *spvVal, BasicBlo lgc::CooperativeMatrixLayout layout = lgc::CooperativeMatrixLayout::InvalidLayout; 
lgc::CooperativeMatrixElementType elemType = lgc::CooperativeMatrixElementType::Unknown; + unsigned kSize = 16; if (oc == OpFNegate || oc == OpSNegate) { auto unary = static_cast(spvVal); Value *srcVal = transValue(unary->getOperand(0), func, bb); if (unary->getOperand(0)->getType()->isTypeCooperativeMatrixKHR()) { SPIRVType *elemSpvType = unary->getOperand(0)->getType()->getCooperativeMatrixKHRComponentType(); + elemType = mapToBasicType(elemSpvType); unsigned rows = unary->getOperand(0)->getType()->getCooperativeMatrixKHRRows(); unsigned columns = unary->getOperand(0)->getType()->getCooperativeMatrixKHRColumns(); - elemType = mapToBasicType(elemSpvType); + kSize = rows > columns ? rows : columns; layout = getCooperativeMatrixKHRLayout( static_cast(unary->getOperand(0)->getType()->getCooperativeMatrixKHRUse()), elemType, rows, columns); } - Type *resultTy = getBuilder()->getCooperativeMatrixTy(elemType, layout); + Type *resultTy = getBuilder()->getCooperativeMatrixTy(elemType, layout, kSize); return getBuilder()->create(resultTy, arithOp, Constant::getNullValue(srcVal->getType()), srcVal, elemType, layout); } else { @@ -11329,14 +11419,15 @@ Value *SPIRVToLLVM::transCooperativeMatrixArithInst(SPIRVValue *spvVal, BasicBlo Value *rhs = transValue(binary->getOperand(1), func, bb); if (binary->getOperand(0)->getType()->isTypeCooperativeMatrixKHR()) { SPIRVType *elemSpvType = binary->getOperand(0)->getType()->getCooperativeMatrixKHRComponentType(); + elemType = mapToBasicType(elemSpvType); unsigned rows = binary->getOperand(0)->getType()->getCooperativeMatrixKHRRows(); unsigned columns = binary->getOperand(0)->getType()->getCooperativeMatrixKHRColumns(); - elemType = mapToBasicType(elemSpvType); + kSize = rows > columns ? 
rows : columns; layout = getCooperativeMatrixKHRLayout( static_cast(binary->getOperand(0)->getType()->getCooperativeMatrixKHRUse()), elemType, rows, columns); } - Type *resultTy = getBuilder()->getCooperativeMatrixTy(elemType, layout); + Type *resultTy = getBuilder()->getCooperativeMatrixTy(elemType, layout, kSize); return getBuilder()->create(resultTy, arithOp, lhs, rhs, elemType, layout); } } @@ -11346,11 +11437,13 @@ Value *SPIRVToLLVM::transCooperativeMatrixArithInst(SPIRVValue *spvVal, BasicBlo Value *SPIRVToLLVM::transCooperativeMatrixKHRFromConstruct(SPIRVType *spvCoopMatTy, const std::vector &constituents) { lgc::CooperativeMatrixElementType elemType = mapToBasicType(spvCoopMatTy->getCooperativeMatrixKHRComponentType()); + unsigned rows = spvCoopMatTy->getCooperativeMatrixKHRRows(); + unsigned columns = spvCoopMatTy->getCooperativeMatrixKHRColumns(); + const unsigned kSize = rows > columns ? rows : columns; lgc::CooperativeMatrixLayout layout = getCooperativeMatrixKHRLayout( - static_cast(spvCoopMatTy->getCooperativeMatrixKHRUse()), elemType, - spvCoopMatTy->getCooperativeMatrixKHRRows(), spvCoopMatTy->getCooperativeMatrixKHRColumns()); - Type *coopMatrixTy = getBuilder()->getCooperativeMatrixTy(elemType, layout); - return getBuilder()->create(coopMatrixTy, constituents[0], elemType, layout); + static_cast(spvCoopMatTy->getCooperativeMatrixKHRUse()), elemType, rows, columns); + Type *coopMatrixTy = getBuilder()->getCooperativeMatrixTy(elemType, layout, kSize); + return getBuilder()->create(coopMatrixTy, constituents[0], elemType, layout, kSize); } } // namespace SPIRV diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.h b/llpc/translator/lib/SPIRV/SPIRVReader.h index ef0e0c1fb3..42dcd27431 100644 --- a/llpc/translator/lib/SPIRV/SPIRVReader.h +++ b/llpc/translator/lib/SPIRV/SPIRVReader.h @@ -77,6 +77,7 @@ enum class LayoutMode : uint8_t { Native = 1, ///< Using native LLVM rules for in-memory layout Explicit = 2, ///< Using layout decorations(like offset) 
from SPIRV Std430 = 3, ///< Using std430 layout rule + Scalar = 4, ///< Using scalar layout }; // Describe what parts of image/sampler descriptors are present. @@ -336,7 +337,6 @@ class SPIRVToLLVM { SmallVector llvmInstructions; }; - lgc::CooperativeMatrixElementType mapToBasicType(Type *const ltType); lgc::CooperativeMatrixElementType mapToBasicType(SPIRVType *const spvType); lgc::CooperativeMatrixLayout getCooperativeMatrixKHRLayout(CooperativeMatrixUse use, lgc::CooperativeMatrixElementType elemTy, unsigned rows, @@ -456,6 +456,10 @@ class SPIRVToLLVM { Value *ConvertingSamplerSelectLadderHelper(Value *result, Value *convertingSamplerIdx, const std::function &createImageOp); + bool isStorageClassExplicitlyLaidOut(SPIRVStorageClassKind storageClass); + bool isStorageClassScalarLayout(SPIRVStorageClassKind storageClass); + LayoutMode getLayoutModeForStorageClass(SPIRVStorageClassKind storageClass); + bool hasSpirvType(SPIRVType *spvTy, spv::Op ty); Value *createTraceRayDialectOp(SPIRVValue *const spvValue); diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.cpp b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.cpp index 5cededd88c..aad253c74d 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.cpp +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.cpp @@ -216,6 +216,9 @@ uint64_t getConstantValue(SPIRVValue *BV, uint32_t I = 0) { ConstVal = static_cast(BV)->getBoolValue(); else if (BV->getOpCode() == OpConstantComposite || BV->getOpCode() == OpSpecConstantComposite) ConstVal = getConstantValue(static_cast(BV)->getElements()[I]); + else if (BV->getOpCode() == OpCompositeConstructReplicateEXT || + BV->getOpCode() == OpSpecConstantCompositeReplicateEXT) + ConstVal = getConstantValue(static_cast(BV)->getElements()[0]); else if (BV->getOpCode() == OpConstantNull || BV->getOpCode() == OpUndef) ConstVal = 0; else if (BV->getOpCode() == OpSpecConstantOp) @@ -232,8 +235,13 @@ SPIRVValue *constantCompositeExtract(SPIRVValue *Composite, 
SPIRVType *ObjectTy, for (auto I : Indices) { if (Composite->getOpCode() == OpUndef || Composite->getOpCode() == OpConstantNull) return BM->addNullConstant(ObjectTy); - assert(Composite->getOpCode() == OpConstantComposite || Composite->getOpCode() == OpSpecConstantComposite); - Composite = static_cast(Composite)->getElements()[I]; + if (Composite->getOpCode() == OpConstantComposite || Composite->getOpCode() == OpSpecConstantComposite) { + assert(Composite->getOpCode() == OpConstantComposite || Composite->getOpCode() == OpSpecConstantComposite); + Composite = static_cast(Composite)->getElements()[I]; + } else if (Composite->getOpCode() == OpConstantCompositeReplicateEXT || + Composite->getOpCode() == OpSpecConstantCompositeReplicateEXT) { + Composite = static_cast(Composite)->getElements()[0]; + } } return Composite; diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h index 5bd63faef8..dd4c1e2f34 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h @@ -1686,6 +1686,46 @@ class SPIRVCompositeConstruct : public SPIRVInstruction { std::vector Constituents; }; +class SPIRVCompositeConstructReplicateEXT : public SPIRVInstruction { +public: + const static Op OC = OpCompositeConstructReplicateEXT; + const static SPIRVWord FixedWordCount = 3; + // Complete constructor + SPIRVCompositeConstructReplicateEXT(SPIRVType *TheType, SPIRVId TheId, const std::vector &TheConstituents, + SPIRVBasicBlock *TheBB) + : SPIRVInstruction(TheConstituents.size() + FixedWordCount, OC, TheType, TheId, TheBB), + Constituents(TheConstituents) { + validate(); + assert(TheBB && "Invalid BB"); + } + + // Incomplete constructor + SPIRVCompositeConstructReplicateEXT() : SPIRVInstruction(OC) {} + + const std::vector getConstituents() const { return getValues(Constituents); } + +protected: + void setWordCount(SPIRVWord TheWordCount) override { + 
SPIRVEntry::setWordCount(TheWordCount); + Constituents.resize(TheWordCount - FixedWordCount); + } + _SPIRV_DEF_DECODE3(Type, Id, Constituents) + void validate() const override { + SPIRVInstruction::validate(); + switch (getValueType(this->getId())->getOpCode()) { + case OpTypeVector: + case OpTypeArray: + case OpTypeStruct: + case OpTypeMatrix: + case OpTypeCooperativeMatrixKHR: + break; + default: + static_assert("Invalid type", ""); + } + } + std::vector Constituents; +}; + class SPIRVCompositeExtract : public SPIRVInstruction { public: const static Op OC = OpCompositeExtract; diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h index 8e25023b0e..19b379af25 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h @@ -578,6 +578,7 @@ inline bool isValid(spv::Capability V) { case CapabilityFragmentShaderSampleInterlockEXT: case CapabilityFragmentShaderShadingRateInterlockEXT: case CapabilityFragmentShaderPixelInterlockEXT: + case CapabilityReplicatedCompositesEXT: return true; default: return false; @@ -622,11 +623,13 @@ inline bool isValid(spv::Op V) { case OpConstantFalse: case OpConstant: case OpConstantComposite: + case OpConstantCompositeReplicateEXT: case OpConstantNull: case OpSpecConstantTrue: case OpSpecConstantFalse: case OpSpecConstant: case OpSpecConstantComposite: + case OpSpecConstantCompositeReplicateEXT: case OpSpecConstantOp: case OpFunction: case OpFunctionParameter: @@ -652,6 +655,7 @@ inline bool isValid(spv::Op V) { case OpVectorInsertDynamic: case OpVectorShuffle: case OpCompositeConstruct: + case OpCompositeConstructReplicateEXT: case OpCompositeExtract: case OpCompositeInsert: case OpCopyObject: @@ -968,6 +972,13 @@ inline bool isValidPackedVectorFormat(spv::PackedVectorFormat V) { } } +inline bool isValidFPEncoding(spv::FPEncoding V) { + switch (V) { + default: + return false; + }; +} + inline bool 
isValidImageOperandsMask(SPIRVWord Mask) { SPIRVWord ValidMask = 0u; ValidMask |= ImageOperandsBiasMask; diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp index 8e408b42d8..90f6800874 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp @@ -228,6 +228,8 @@ class SPIRVModuleImpl : public SPIRVModule { SPIRVInstruction *addLoadInst(SPIRVValue *, const std::vector &, SPIRVBasicBlock *) override; SPIRVInstruction *addPhiInst(SPIRVType *, std::vector, SPIRVBasicBlock *) override; SPIRVInstruction *addCompositeConstructInst(SPIRVType *, const std::vector &, SPIRVBasicBlock *) override; + SPIRVInstruction *addCompositeConstructReplicateEXTInst(SPIRVType *, const std::vector &, + SPIRVBasicBlock *) override; SPIRVInstruction *addCompositeExtractInst(SPIRVType *, SPIRVValue *, const std::vector &, SPIRVBasicBlock *) override; SPIRVInstruction *addCompositeInsertInst(SPIRVValue *Object, SPIRVValue *Composite, @@ -1006,6 +1008,12 @@ SPIRVInstruction *SPIRVModuleImpl::addCompositeConstructInst(SPIRVType *Type, co return addInstruction(new SPIRVCompositeConstruct(Type, getId(), Constituents, BB), BB); } +SPIRVInstruction *SPIRVModuleImpl::addCompositeConstructReplicateEXTInst(SPIRVType *Type, + const std::vector &Constituents, + SPIRVBasicBlock *BB) { + return addInstruction(new SPIRVCompositeConstructReplicateEXT(Type, getId(), Constituents, BB), BB); +} + SPIRVInstruction *SPIRVModuleImpl::addCompositeExtractInst(SPIRVType *Type, SPIRVValue *TheVector, const std::vector &Indices, SPIRVBasicBlock *BB) { return addInstruction(new SPIRVCompositeExtract(Type, getId(), TheVector, Indices, BB), BB); diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h index 3f3a3c6835..c5f47125fb 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h +++ 
b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h @@ -229,6 +229,8 @@ class SPIRVModule { virtual void addCapabilityInternal(SPIRVCapabilityKind) = 0; virtual SPIRVInstruction *addCallInst(SPIRVFunction *, const std::vector &, SPIRVBasicBlock *) = 0; virtual SPIRVInstruction *addCompositeConstructInst(SPIRVType *, const std::vector &, SPIRVBasicBlock *) = 0; + virtual SPIRVInstruction *addCompositeConstructReplicateEXTInst(SPIRVType *, const std::vector &, + SPIRVBasicBlock *) = 0; virtual SPIRVInstruction *addCompositeExtractInst(SPIRVType *, SPIRVValue *, const std::vector &, SPIRVBasicBlock *) = 0; virtual SPIRVInstruction *addCompositeInsertInst(SPIRVValue *, SPIRVValue *, const std::vector &, diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h index 49e92c312e..ba195f5c27 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h @@ -521,6 +521,7 @@ template <> inline void SPIRVMap::init() { add(CapabilityFragmentShaderSampleInterlockEXT, "FragmentShaderSampleInterlockEXT"); add(CapabilityFragmentShaderShadingRateInterlockEXT, "FragmentShaderShadingRateInterlockEXT"); add(CapabilityFragmentShaderPixelInterlockEXT, "FragmentShaderPixelInterlockEXT"); + add(CapabilityReplicatedCompositesEXT, "ReplicatedCompositesEXT"); } SPIRV_DEF_NAMEMAP(Capability, SPIRVCapabilityNameMap) @@ -529,6 +530,10 @@ template <> inline void SPIRVMap::init() { } SPIRV_DEF_NAMEMAP(PackedVectorFormat, SPIRVPackedVectorFormatNameMap); +template <> inline void SPIRVMap::init() { +} +SPIRV_DEF_NAMEMAP(FPEncoding, SPIRVFPEncodingNameMap); + } /* namespace SPIRV */ #endif diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h index 851aea834d..d990ab4411 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h @@ -298,6 
+298,9 @@ _SPIRV_OP(SUDotAccSatKHR, 4455) _SPIRV_OP(TraceRayKHR, 4445) _SPIRV_OP(ExecuteCallableKHR, 4446) _SPIRV_OP(ConvertUToAccelerationStructureKHR, 4447) +_SPIRV_OP(ConstantCompositeReplicateEXT, 4461) +_SPIRV_OP(SpecConstantCompositeReplicateEXT, 4462) +_SPIRV_OP(CompositeConstructReplicateEXT, 4463) _SPIRV_OP(TypeRayQueryKHR, 4472) _SPIRV_OP(RayQueryInitializeKHR, 4473) _SPIRV_OP(RayQueryTerminateKHR, 4474) diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.cpp b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.cpp index f172848ebf..d2370d990d 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.cpp +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.cpp @@ -239,7 +239,7 @@ bool SPIRVType::isTypeBool() const { } bool SPIRVType::isTypeComposite() const { - return isTypeVector() || isTypeMatrix() || isTypeArray() || isTypeStruct(); + return isTypeVector() || isTypeMatrix() || isTypeArray() || isTypeStruct() || isTypeCooperativeMatrixKHR(); } bool SPIRVType::isTypeFloat(unsigned Bits) const { @@ -300,6 +300,10 @@ bool SPIRVType::isTypeCooperativeMatrixKHR() const { void SPIRVTypeFloat::decode(std::istream &I) { getDecoder(I) >> (Id) >> (BitWidth); + if (WordCount > FixedWC) + getDecoder(I) >> (Encoding); + else + Encoding = FPEncodingMax; } bool SPIRVType::isTypeVectorBool() const { diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.h index 560395a3d2..cf89b5f8ee 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.h @@ -186,7 +186,7 @@ class SPIRVTypeFloat : public SPIRVType { (void(Encoding)); // Unused } // Incomplete constructor - SPIRVTypeFloat() : SPIRVType(OC), BitWidth(0) {} + SPIRVTypeFloat() : SPIRVType(OC), BitWidth(0), Encoding(FPEncodingMax) {} unsigned getBitWidth() const { return BitWidth; } @@ -194,7 +194,6 @@ class SPIRVTypeFloat : public SPIRVType { SPIRVCapVec CV; if (isTypeFloat(64)) CV.push_back(CapabilityFloat64); - 
return CV; } @@ -202,7 +201,9 @@ class SPIRVTypeFloat : public SPIRVType { _SPIRV_DCL_DECODE void validate() const override { SPIRVEntry::validate(); - assert(BitWidth >= 16 && BitWidth <= 64 && "Invalid bit width"); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid bit width"); + if (WordCount > FixedWC) + assert(isValidFPEncoding(static_cast(Encoding)) && "Invalid Floating Point Encoding"); } private: diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVValue.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVValue.h index e46543b454..7305b86c43 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVValue.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVValue.h @@ -275,6 +275,11 @@ class SPIRVConstantComposite : public SPIRVValue { std::vector Elements; }; +class SPIRVConstantCompositeReplicateEXT : public SPIRVConstantComposite { +public: + SPIRVConstantCompositeReplicateEXT() : SPIRVConstantComposite() { OpCode = OpConstantCompositeReplicateEXT; } +}; + class SPIRVSpecConstantTrue : public SPIRVConstantBool { public: SPIRVSpecConstantTrue() : SPIRVConstantBool() {} @@ -295,6 +300,11 @@ class SPIRVSpecConstantComposite : public SPIRVConstantComposite { SPIRVSpecConstantComposite() : SPIRVConstantComposite() { OpCode = OpSpecConstantComposite; } }; +class SPIRVSpecConstantCompositeReplicateEXT : public SPIRVConstantComposite { +public: + SPIRVSpecConstantCompositeReplicateEXT() : SPIRVConstantComposite() { OpCode = OpSpecConstantCompositeReplicateEXT; } +}; + class SPIRVForward : public SPIRVValue, public SPIRVComponentExecutionModes { public: const static Op OC = OpForward; diff --git a/llpc/unittests/context/testOptLevel.cpp b/llpc/unittests/context/testOptLevel.cpp index 0c679bf298..dc65ab94ae 100644 --- a/llpc/unittests/context/testOptLevel.cpp +++ b/llpc/unittests/context/testOptLevel.cpp @@ -79,7 +79,7 @@ TEST(LlpcContextTests, MatchPipelineOptLevel) { ComputePipelineBuildInfo pipelineInfo = {}; pipelineInfo.options.optimizationLevel = static_cast(optLevel); - 
ComputeContext computeContext(GfxIp, "Vulkan", &pipelineInfo, &pipelineHash, &cacheHash); + ComputeContext computeContext(GfxIp, "Vulkan", &pipelineInfo, StringRef(""), &pipelineHash, &cacheHash); context.attachPipelineContext(&computeContext); diff --git a/llpc/util/llpcElfWriter.cpp b/llpc/util/llpcElfWriter.cpp index 00d754f0a5..c70c7c7905 100644 --- a/llpc/util/llpcElfWriter.cpp +++ b/llpc/util/llpcElfWriter.cpp @@ -358,6 +358,7 @@ void ElfWriter::mergeMetaNote(Context *pContext, const ElfNote *pNote1, con PalAbi::GraphicsRegisterMetadataKey::CbShaderMask, PalAbi::GraphicsRegisterMetadataKey::AaCoverageToShaderSelect, PalAbi::GraphicsRegisterMetadataKey::PsLoadProvokingVtx, + PalAbi::GraphicsRegisterMetadataKey::PsExtraLdsSize, }; auto destRegisters = destPipeline.getMap(true)[PalAbi::PipelineMetadataKey::GraphicsRegisters].getMap(true); auto srcRegisters = srcPipeline.getMap(true)[PalAbi::PipelineMetadataKey::GraphicsRegisters].getMap(true); diff --git a/llpc/util/llpcError.cpp b/llpc/util/llpcError.cpp index adeafee8a9..07bc31e82c 100644 --- a/llpc/util/llpcError.cpp +++ b/llpc/util/llpcError.cpp @@ -114,12 +114,13 @@ std::error_code resultToErrorCode(Result result) { // Prints the error message in `err` to LLPC_ERRS and consumes the error. // // @param err : The error to handle. This must not be an `ErrorSuccess`. -// @returns: The underlying `Result` when `err` is a `ResultError`, `Result::ErrorUnknown` otherwise. -Result reportError(Error &&err) { +// @param defaultErrorResult : The Result code to use for unknown error kinds. +// @returns: The underlying `Result` when `err` is a `ResultError`, defaultErrorResult otherwise. +Result reportError(Error &&err, Result defaultErrorResult) { // For details on llvm error handling, see https://llvm.org/docs/ProgrammersManual.html#recoverable-errors. 
assert(err && "llvm::ErrorSuccess is not an error"); - Result result = Result::ErrorUnknown; + Result result = defaultErrorResult; handleAllErrors( std::move(err), [&result](const ResultError &resultError) { diff --git a/llpc/util/llpcError.h b/llpc/util/llpcError.h index 7e854d54ab..b2c0412631 100644 --- a/llpc/util/llpcError.h +++ b/llpc/util/llpcError.h @@ -42,7 +42,8 @@ void mustSucceed(Vkgc::Result result, const llvm::Twine &errorMessage = {}); // Prints the error to LLPC_ERRS and consumes it. Returns the underlying `Result` when the error is a `ResultError`, or // `Result::ErrorUnknown` otherwise. -LLPC_NODISCARD Vkgc::Result reportError(llvm::Error &&err); +LLPC_NODISCARD Vkgc::Result reportError(llvm::Error &&err, + Vkgc::Result defaultErrorResult = Vkgc::Result::ErrorUnknown); // Converts a `Vkgc::Result` to `std::error_code` with a custom error category. LLPC_NODISCARD std::error_code resultToErrorCode(Vkgc::Result result); @@ -120,6 +121,12 @@ inline llvm::Error createResultError(Vkgc::Result result, const llvm::Twine &err return llvm::make_error(result, errorMessage); } +inline llvm::Error resultToError(Vkgc::Result result, const llvm::Twine &errorMessage = {}) { + if (result == Vkgc::Result::Success) + return llvm::Error::success(); + return createResultError(result, errorMessage); +} + // Extracts the `Result` value from the given `ResultError`. Assumes that `err` is either a `ResultError` or // `llvm::ErrorSuccess`. LLPC_NODISCARD Vkgc::Result errorToResult(llvm::Error &&err); diff --git a/llpc/util/llpcThreading.cpp b/llpc/util/llpcThreading.cpp new file mode 100644 index 0000000000..03e62b500b --- /dev/null +++ b/llpc/util/llpcThreading.cpp @@ -0,0 +1,255 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2016-2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +#include "llpcThreading.h" +#include "llpc.h" +#include + +using namespace llvm; +using namespace Llpc; + +namespace { + +// ===================================================================================================================== +// Limited implementation of Llpc::IHelperThreadProvider to support extra threads when no helper thread provider is +// given. 
+class InternalHelperThreadProvider : public Llpc::IHelperThreadProvider { +public: + virtual void SetTasks(ThreadFunction *, uint32_t numTasks, void *) override { + assert(!m_totalInstances && "InternalHelperThreadProvider is single use"); + m_totalInstances = numTasks; + } + + virtual bool GetNextTask(uint32_t *pTaskIndex) override { + assert(pTaskIndex != nullptr); + *pTaskIndex = m_nextInstance.fetch_add(1); + return (*pTaskIndex < m_totalInstances); + } + + virtual void TaskCompleted() override { + uint32_t completedInstances = m_completedInstances.fetch_add(1) + 1; + if (completedInstances == m_totalInstances) + m_event.notify_all(); + } + + virtual void WaitForTasks() override { + std::unique_lock lock(m_lock); + while (m_completedInstances < m_totalInstances) + m_event.wait(lock); + } + +private: + uint32_t m_totalInstances = 0; + std::atomic m_nextInstance = 0; + std::atomic m_completedInstances = 0; + std::condition_variable m_event; + std::mutex m_lock; +}; + +struct ParallelForWithContextState { + std::atomic helperThreadJoined = false; + std::atomic mainThreadUnlocked = false; + std::mutex mutex; + std::condition_variable_any cvar; + std::atomic haveError = false; + Error error = Error::success(); + HelperThreadExclusion helperThreadExclusion = HelperThreadExclusion::CreateContext; + function_ref createContext; + function_ref taskFunction; + function_ref destroyContext; + + bool recordError(Error err) { + // Record only the first error, ignore all subsequent ones. + if (!haveError.exchange(true, std::memory_order_relaxed)) { + // We have exclusive access here because + // 1. the atomic exchange ensures that only one thread ever executes this assignment + // 2. the error is read by the main thread only after waiting for all tasks to complete, and we only signal + // completion of the failed task after recording the error + // The second point is also required to justify using a relaxed atomic for the exchange. 
+ error = std::move(err); + return true; + } else { + consumeError(std::move(err)); + return false; + } + } + + // Returns true if all tasks are known to be completed or about to be completed by another thread. + bool runInnerLoop(IHelperThreadProvider *helperThreadProvider, void *context, unsigned firstIndex, + function_ref shouldBreak = {}) { + unsigned taskIndex = firstIndex; + do { + bool error = false; + bool recordedError = false; + + if (Error err = taskFunction(taskIndex, context)) { + error = true; + recordedError = recordError(std::move(err)); + } + + // Subtle: signaling completion must happen after recording an error. + helperThreadProvider->TaskCompleted(); + + if (recordedError) { + // Drain all remaining tasks from a single thread when an error occurs. + while (helperThreadProvider->GetNextTask(&taskIndex)) + helperThreadProvider->TaskCompleted(); + } + + if (error) + return true; // either we just drained everything or somebody else does so concurrently + + if (shouldBreak && shouldBreak()) + return false; + + if (haveError.load(std::memory_order_relaxed)) { + // Some other thread encountered an error and is draining all remaining tasks concurrently. + return true; + } + } while (helperThreadProvider->GetNextTask(&taskIndex)); + + return true; + } + + // Entry point for helper thread that join the parallel for. + static void runHelperThread(IHelperThreadProvider *helperThreadProvider, void *data) { + ParallelForWithContextState *state = static_cast(data); + + // Pre-load the flag to avoid dirtying shared data. + if (!state->helperThreadJoined.load(std::memory_order_relaxed)) + state->helperThreadJoined.store(true, std::memory_order_relaxed); + + unsigned taskIndex; + if (!helperThreadProvider->GetNextTask(&taskIndex)) + return; + + void *context = nullptr; + + if (state->helperThreadExclusion != HelperThreadExclusion::CreateContext) { + // Create the context early if allowed so that we spend less time waiting for the main thread to unlock us. 
+ context = state->createContext(); + } + + if (state->helperThreadExclusion != HelperThreadExclusion::None && + !state->mainThreadUnlocked.load(std::memory_order_acquire)) { + std::unique_lock lock(state->mutex); + state->cvar.wait(state->mutex, [state]() { return state->mainThreadUnlocked.load(std::memory_order_acquire); }); + } + + if (!context) + context = state->createContext(); + + state->runInnerLoop(helperThreadProvider, context, taskIndex); + state->destroyContext(context); + }; +}; + +} // anonymous namespace + +Error Llpc::detail::parallelForWithContextImpl(size_t numExtraThreads, IHelperThreadProvider *helperThreadProvider, + size_t numTasks, HelperThreadExclusion helperThreadExclusion, + function_ref createContext, + function_ref taskFunction, + function_ref destroyContext) { + if (!numTasks) + return Error::success(); + + InternalHelperThreadProvider ourHelperThreadProvider; + if (numExtraThreads && !helperThreadProvider) + helperThreadProvider = &ourHelperThreadProvider; + + if (!helperThreadProvider) { + for (size_t i = 0; i < numTasks; ++i) { + if (Error err = taskFunction(i, nullptr)) + return err; + } + return Error::success(); + } + + ParallelForWithContextState state; + state.helperThreadExclusion = helperThreadExclusion; + state.createContext = createContext; + state.taskFunction = taskFunction; + state.destroyContext = destroyContext; + + // If we have extra threads, assume that they join immediately so that we never give the exclusive lock to the main + // thread. + if (numExtraThreads) { + state.helperThreadJoined.store(true, std::memory_order_relaxed); + state.mainThreadUnlocked.store(true, std::memory_order_relaxed); + } + + // This is implicitly a release fence. Helper threads may be executing from this point on. 
+ helperThreadProvider->SetTasks(&ParallelForWithContextState::runHelperThread, numTasks, &state); + + std::vector workers(numExtraThreads); + for (std::thread &worker : workers) { + worker = std::thread( + [helperThreadProvider, &state] { ParallelForWithContextState::runHelperThread(helperThreadProvider, &state); }); + } + + unsigned taskIndex; + if (!helperThreadProvider->GetNextTask(&taskIndex)) { + // This can happen if a helper thread races us. + } else { + if (helperThreadExclusion == HelperThreadExclusion::None) { + state.runInnerLoop(helperThreadProvider, nullptr, taskIndex); + } else { + bool drained = false; + + if (!numExtraThreads) { + // If we don't spawn additional threads ourselves, we rely on threads from the provider. There is no guarantee + // that other threads will arrive soon or at all, so run without a context on the main thread first. This avoids + // the cost of running with a context if it later turns out to have been unnecessary. + drained = state.runInnerLoop(helperThreadProvider, nullptr, taskIndex, + [&state] { return state.helperThreadJoined.load(std::memory_order_relaxed); }); + if (!drained) + drained = !helperThreadProvider->GetNextTask(&taskIndex); + + // The release pairs with the acquire in the helper thread. The point of this synchronization is to synchronize + // the structures the caller has which require the helper thread exclusion. (We need it at least for the acquire + // that happens outside of the mutex.) + // + // Note that if we have extra threads, we start in the unlocked state and don't have to notify the condition + // variable. 
+ state.mainThreadUnlocked.store(true, std::memory_order_release); + state.cvar.notify_all(); + } + + if (!drained) { + void *context = state.createContext(); + state.runInnerLoop(helperThreadProvider, context, taskIndex); + state.destroyContext(context); + } + } + } + + helperThreadProvider->WaitForTasks(); + + for (std::thread &worker : workers) + worker.join(); + + return std::move(state.error); +} diff --git a/llpc/util/llpcThreading.h b/llpc/util/llpcThreading.h index 62224debe8..507206ae1e 100644 --- a/llpc/util/llpcThreading.h +++ b/llpc/util/llpcThreading.h @@ -1,13 +1,13 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2021 Google LLC. All Rights Reserved. + * Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all @@ -17,9 +17,9 @@ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. * **********************************************************************************************************************/ /** @@ -31,6 +31,7 @@ #pragma once #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/Support/Error.h" #include #include @@ -39,6 +40,22 @@ namespace Llpc { +class IHelperThreadProvider; + +/// The level of exclusion that is required for helper threads in @ref parallelForWithContext. +enum class HelperThreadExclusion { + // No exclusion necessary. + None, + + // The main thread must no longer be running with context == nullptr when the taskFunction is called for a helper + // thread. + Task, + + // In addition to @ref Task, the main thread also must not be running with context == nullptr when the createContext + // function is called. 
+ CreateContext, +}; + namespace detail { // ===================================================================================================================== // Decides how many concurrent threads to use, taking into the requested number of threads, the number of tasks @@ -62,8 +79,50 @@ inline unsigned decideNumConcurrentThreads(size_t numThreadsRequested, size_t nu return std::min(numThreadsRequested, numTasks); } + +llvm::Error parallelForWithContextImpl(size_t numExtraThreads, IHelperThreadProvider *helperThreadProvider, + size_t numTasks, HelperThreadExclusion helperThreadExclusion, + llvm::function_ref createContext, + llvm::function_ref taskFunction, + llvm::function_ref destroyContext); } // namespace detail +// A parallel for loop using an optional IHelperThreadProvider and a given number of extra threads that are created +// on-the-fly. +// +// This function is designed for tasks where the helper threads require some context that is expensive to set up and/or +// running on a helper thread is less efficient, and the context can be re-used across individual tasks. +// +// The taskFunction is called for each task (as long as no error is encountered), with a context created by +// createContext as an argument or null if the task is called on the main thread (the thread calling +// parallelForWithContext). +// +// Set helperExclusion to a value other than @ref HelperThreadExclusion::None if helper tasks cannot run while +// taskFunction is running with a null context on the main thread. In that case, as soon as helper threads join, +// the main thread will create its own context to run subsequent tasks with. +// +// Returns the first error that was returned by taskFunction. Once an error is encountered, subsequent tasks may be +// skipped. 
+template +llvm::Error parallelForWithContext(size_t numExtraThreads, IHelperThreadProvider *helperThreadProvider, size_t numTasks, + HelperThreadExclusion helperThreadExclusion, + llvm::function_ref()> createContext, + llvm::function_ref taskFunction, + llvm::function_ref)> destroyContext) { + // Forward to a type-erased implementation. The type erasure costs a heap allocation of the context (instead of a + // stack allocation on the helper thread stack), but the premise is that the context is expensive to create anyway. + return ::Llpc::detail::parallelForWithContextImpl( + numExtraThreads, helperThreadProvider, numTasks, helperThreadExclusion, + [createContext]() -> void * { + std::unique_ptr ctx = createContext(); + return ctx.release(); + }, + [taskFunction](size_t idx, void *context) -> llvm::Error { + return taskFunction(idx, static_cast(context)); + }, + [destroyContext](void *context) { destroyContext(std::unique_ptr(static_cast(context))); }); +} + // ===================================================================================================================== // A parallel for loop implementation using a simple worker thread pool. Unlike `llvm::parallel*` algorithms, does not // depend on a global thread pool strategy. diff --git a/llvmraytracing/CMakeLists.txt b/llvmraytracing/CMakeLists.txt index 59693457a6..b532314daa 100644 --- a/llvmraytracing/CMakeLists.txt +++ b/llvmraytracing/CMakeLists.txt @@ -1,3 +1,28 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + cmake_minimum_required(VERSION 3.13.4) project(LlvmRaytracing LANGUAGES CXX) @@ -15,6 +40,7 @@ option(LLVMRAYTRACING_BUILD_TESTS "Build raytracing tests") add_llvm_library(LLVMRaytracing lib/CleanupContinuations.cpp + lib/ContStateBuilder.cpp lib/Continuations.cpp lib/ContinuationsLint.cpp lib/CpsStackLowering.cpp diff --git a/llvmraytracing/README.md b/llvmraytracing/README.md index 0ce2cbed6e..8a92b338dd 100644 --- a/llvmraytracing/README.md +++ b/llvmraytracing/README.md @@ -1,10 +1,19 @@ -# Continuations +# llvmraytracing -A collection of passes to convert shaders to coroutines. -Some passes work on DXIL, some passes are generic. +A library to implement ray tracing pipeline compilation using coroutine transforms. 
-This is supposed to be used as a submodule in a driver repository. +## Details -### Tests +Ray tracing shaders are expected in the `lgc.rt` dialect. -Lit tests are behind the `check-llvmraytracing` CMake target, they can be run with `make check-llvmraytracing`. +We first lower ray tracing, using the GPURT library which provides implementations for intrinsics to be inlined, and +standalone driver shaders such as the Traversal shader. The result is in `lgc.cps` dialect form, using await and enqueue +operations to represent indirect function calls, indirect tail calls, and returns. + +Then, we use the LLVM coroutine infrastructure for lowering awaits, splitting functions and introducing resume functions. + +At the end, we apply some cleanups to prepare IR for the backend. + +## Tests + +Lit tests are behind the `check-llvmraytracing` CMake target. diff --git a/llvmraytracing/include/lgc/LgcCpsDialect.td b/llvmraytracing/include/lgc/LgcCpsDialect.td index 0d63a9c9ca..c6df4411a4 100644 --- a/llvmraytracing/include/lgc/LgcCpsDialect.td +++ b/llvmraytracing/include/lgc/LgcCpsDialect.td @@ -43,7 +43,7 @@ def ContinuationReference : TgConstant<(I32)>, Type; // ===================================================================================================================== def JumpOp : LgcCpsOp<"jump", [NoReturn]> { - let arguments = (ins ContinuationReference:$target, AttrI32:$levels, I32:$csp, ContinuationReference:$rcr, varargs:$tail); + let arguments = (ins ContinuationReference:$target, AttrI32:$levels, I32:$csp, I32:$shaderIndex, ContinuationReference:$rcr,varargs:$tail); let results = (outs); let summary = "Jump to a CPS function."; @@ -52,6 +52,8 @@ def JumpOp : LgcCpsOp<"jump", [NoReturn]> { * target, the continuation reference * levels, a bitmask of levels in which target may run * csp, continuation stack pointer, + * shader index, the shader binding table index. 
This index was used to lookup the + target shader, if it is the entry function of an app shader and poison otherwise. * rcr, a continuation reference the called function can potentially return to * an arbitrary set of arguments appended to the tail of the argument list. }]; diff --git a/llvmraytracing/include/lgc/LgcIlCpsDialect.td b/llvmraytracing/include/lgc/LgcIlCpsDialect.td index 05664270e6..06a5c49072 100644 --- a/llvmraytracing/include/lgc/LgcIlCpsDialect.td +++ b/llvmraytracing/include/lgc/LgcIlCpsDialect.td @@ -53,8 +53,24 @@ def GetReturnValueOp : LgcIlCpsOp<"getReturnValue", [NoUnwind, WillReturn]> { }]; } +def SetLocalRootIndexOp : LgcIlCpsOp<"setLocalRootIndex", [WillReturn]> { + let arguments = (ins I32:$localRootIndex); + let results = (outs); + + let summary = + "Sets the local root signature for the current shader"; + + let description = [{ + This op accepts an i32, which is the local root signature index. This is + used to ensure that the local root index gets properly set before any operations + that depend on it. The arguments are: + + - localRootIndex: the local root signature index + }]; +} + def ContinueOp : LgcIlCpsOp<"continue", [NoReturn]> { - let arguments = (ins I64:$shaderAddr, I32:$csp, I32:$returnAddr, varargs:$tail); + let arguments = (ins I64:$shaderAddr, I32:$csp, I32:$shaderIndex, I32:$returnAddr, varargs:$tail); let results = (outs); let summary = @@ -66,6 +82,8 @@ def ContinueOp : LgcIlCpsOp<"continue", [NoReturn]> { - shaderAddr, the shader the current shader should jump to - csp, the continuation stack pointer. Whatever is passed here, is going to be overridden by the compiler. + - shader index, the shader binding table index. This index was used to lookup the + target shader, if it is the entry function of an app shader and poison otherwise. - returnAddr, the return address the called shader should jump back to, e. g. the resume function. - tail, a set of arguments like the system data or hit attributes. 
@@ -73,7 +91,7 @@ def ContinueOp : LgcIlCpsOp<"continue", [NoReturn]> { } def WaitContinueOp : LgcIlCpsOp<"waitContinue", [NoReturn]> { - let arguments = (ins I64:$shaderAddr, I64:$waitMask, I32:$csp, I32:$returnAddr, varargs:$tail); + let arguments = (ins I64:$shaderAddr, I64:$waitMask, I32:$csp, I32:$shaderIndex, I32:$returnAddr, varargs:$tail); let results = (outs); let summary = @@ -86,6 +104,8 @@ def WaitContinueOp : LgcIlCpsOp<"waitContinue", [NoReturn]> { - waitMask, the bitmask all lanes have to wait for. - csp, the continuation stack pointer. Whatever is passed here, is going to be overridden by the compiler. + - shader index, the shader binding table index. This index was used to lookup the + target shader, if it is the entry function of an app shader and poison otherwise. - returnAddr, the return address the called shader should jump back to, e. g. the resume function. - tail, a set of arguments like the system data or hit attributes. diff --git a/llvmraytracing/include/llvmraytracing/Continuations.h b/llvmraytracing/include/llvmraytracing/Continuations.h index 65f770046e..c2caca781c 100644 --- a/llvmraytracing/include/llvmraytracing/Continuations.h +++ b/llvmraytracing/include/llvmraytracing/Continuations.h @@ -106,9 +106,6 @@ std::optional rtShaderStageToPAQShaderStage(lgc::rt::RayTracingS /// Returns true if something changed. bool fixupDxilMetadata(Module &M); -/// Get intrinsic to set the local root signature index. -Function *getSetLocalRootIndex(Module &M); - /// Get intrinsic to convert a dx handle to an acceleration struct address. 
Function *getAccelStructAddr(Module &M, Type *HandleTy); @@ -250,7 +247,7 @@ bool DXILMaterializable(Instruction &I); // coro-split) class DXILCoroSplitPass : public CoroSplitPass { public: - DXILCoroSplitPass() : CoroSplitPass(std::function(&DXILMaterializable), true) {} + DXILCoroSplitPass(); static llvm::StringRef name() { return "DXIL continuations coro split pass wrapper"; } }; @@ -267,7 +264,7 @@ bool LgcMaterializable(Instruction &I); // coro-split) class LgcCoroSplitPass : public CoroSplitPass { public: - LgcCoroSplitPass() : CoroSplitPass(std::function(&LgcMaterializable), true) {} + LgcCoroSplitPass(); static llvm::StringRef name() { return "Lgc continuations coro split pass wrapper"; } }; diff --git a/llvmraytracing/include/llvmraytracing/ContinuationsDialect.h b/llvmraytracing/include/llvmraytracing/ContinuationsDialect.h index d1ca5f0959..b5b4bae1c3 100644 --- a/llvmraytracing/include/llvmraytracing/ContinuationsDialect.h +++ b/llvmraytracing/include/llvmraytracing/ContinuationsDialect.h @@ -1,3 +1,28 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + // Transition header -- to be removed #include "lgc/LgcIlCpsDialect.h" diff --git a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h index 3d95f02157..fff75a0904 100644 --- a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h +++ b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h @@ -86,8 +86,8 @@ const unsigned GlobalMaxHitAttributeBytes = 32; const unsigned MinimumContinuationStateBytes = 8; struct CpsArgIdx { - static constexpr uint32_t ReturnAddr = 0; - static constexpr uint32_t ShaderIndex = 1; + static constexpr uint32_t ShaderIndex = 0; + static constexpr uint32_t ReturnAddr = 1; static constexpr uint32_t SystemData = 2; static constexpr uint32_t HitAttributes = 3; static constexpr uint32_t Padding = 4; @@ -95,8 +95,8 @@ struct CpsArgIdx { }; struct CpsArgIdxWithStackPtr { - static constexpr uint32_t ReturnAddr = 0; - static constexpr uint32_t ShaderIndex = 1; + static constexpr uint32_t ShaderIndex = 0; + static constexpr uint32_t ReturnAddr = 1; static constexpr uint32_t CspInit = 2; static constexpr uint32_t SystemData = 3; static constexpr uint32_t HitAttributes = 4; @@ -428,8 +428,6 @@ class ContHelper { static void removeWaitMask(CallInst &CI) { CI.setMetadata(MDWaitMaskName, nullptr); } - static bool isLgcCpsModule(Module &Mod) { return 
Mod.getNamedMetadata(MDLgcCpsModuleName) != nullptr; } - /// Returns true if a call to the given function should be rematerialized /// in a shader of the specified kind. /// @@ -528,6 +526,7 @@ class ShaderStageHelper final { namespace ContDriverFunc { #define DRIVER_FUNC_NAME(KEY) constexpr const char *KEY##Name = "_cont_" #KEY; +DRIVER_FUNC_NAME(DispatchRaysIndex3) DRIVER_FUNC_NAME(GetContinuationStackGlobalMemBase) DRIVER_FUNC_NAME(GetTriangleHitAttributes) DRIVER_FUNC_NAME(SetTriangleHitAttributes) @@ -536,8 +535,6 @@ DRIVER_FUNC_NAME(GetCommittedState) DRIVER_FUNC_NAME(GetContinuationStackAddr) DRIVER_FUNC_NAME(ExitRayGen) DRIVER_FUNC_NAME(IsEndSearch) -DRIVER_FUNC_NAME(GetLocalRootIndex) -DRIVER_FUNC_NAME(SetLocalRootIndex) DRIVER_FUNC_NAME(TraceRay) DRIVER_FUNC_NAME(CallShader) DRIVER_FUNC_NAME(ReportHit) diff --git a/llvmraytracing/include/llvmraytracing/CpsStackLowering.h b/llvmraytracing/include/llvmraytracing/CpsStackLowering.h index 16e0ab65e1..e9826e50b9 100644 --- a/llvmraytracing/include/llvmraytracing/CpsStackLowering.h +++ b/llvmraytracing/include/llvmraytracing/CpsStackLowering.h @@ -93,7 +93,7 @@ class CpsStackLowering { void visitStore(llvm::StoreInst &); void visitContinue(lgc::ilcps::ContinueOp &); void visitWaitContinue(lgc::ilcps::WaitContinueOp &); - llvm::Value *getRealMemoryAddress(llvm::IRBuilder<> &, llvm::Value *); + llvm::Value *getRealMemoryAddress(llvm::Value *); llvm::Function *addOrInitCsp(llvm::Function *F, llvm::Function *GetGlobalMemBase, bool RequiresIncomingCsp); // Register a base pointer in the CpsStackLowering. @@ -106,7 +106,7 @@ class CpsStackLowering { // offset. 
void setRealBasePointer(llvm::Value *BasePointer) { this->BasePointer = BasePointer; } - llvm::Value *loadCsp(llvm::IRBuilder<> &Builder); + llvm::Value *loadCsp(); llvm::Module *Mod; llvm::AllocaInst *CpsStackAlloca = nullptr; diff --git a/llvmraytracing/include/llvmraytracing/PipelineState.h b/llvmraytracing/include/llvmraytracing/PipelineState.h index ba0a2fc570..a9fc7fbb55 100644 --- a/llvmraytracing/include/llvmraytracing/PipelineState.h +++ b/llvmraytracing/include/llvmraytracing/PipelineState.h @@ -58,6 +58,7 @@ namespace llvm { class Module; +class raw_ostream; namespace msgpack { class DocNode; } // namespace msgpack @@ -82,6 +83,11 @@ class PipelineState { void merge(const PipelineState &Other); + void print(llvm::raw_ostream &OS) const; +#ifndef NDEBUG + void dump() const; +#endif + private: // Actual state is intentionally private, as this interface is intended to be used like opaque state. // llvmraytracing passes don't use this interface, and instead directly work on module metadata. 
diff --git a/llvmraytracing/include/llvmraytracing/SpecializeDriverShaders.h b/llvmraytracing/include/llvmraytracing/SpecializeDriverShaders.h index 04a62ba405..3d4c4bc27a 100644 --- a/llvmraytracing/include/llvmraytracing/SpecializeDriverShaders.h +++ b/llvmraytracing/include/llvmraytracing/SpecializeDriverShaders.h @@ -166,6 +166,8 @@ class SpecializeDriverShadersState { void merge(const Self &Other); + void print(llvm::raw_ostream &OS) const; + private: friend class SpecializeDriverShadersPass; diff --git a/llvmraytracing/lib/CleanupContinuations.cpp b/llvmraytracing/lib/CleanupContinuations.cpp index c58c2184b7..33bfd756d8 100644 --- a/llvmraytracing/lib/CleanupContinuations.cpp +++ b/llvmraytracing/lib/CleanupContinuations.cpp @@ -123,7 +123,6 @@ class CleanupContinuationsPassImpl { Function *ContMalloc = nullptr; Function *ContFree = nullptr; MapVector ToProcess; - uint32_t MaxContStateBytes; llvm::Module *GpurtLibrary = nullptr; std::optional StackLowering; Function *GetGlobalMemBase = nullptr; @@ -237,8 +236,6 @@ void CleanupContinuationsPassImpl::analyzeContinuation(Function &F, MDNode *MD) if (Data.MallocCall) { Data.ContStateBytes = cast(Data.MallocCall->getArgOperand(0))->getSExtValue(); } - if (Data.ContStateBytes > MaxContStateBytes) - MaxContStateBytes = Data.ContStateBytes; } void CleanupContinuationsPassImpl::updateCpsStack(Function *F, Function *NewFunc, bool IsStart, @@ -258,7 +255,7 @@ void CleanupContinuationsPassImpl::updateCpsStack(Function *F, Function *NewFunc Value *ContFrame = getContinuationFramePtr(F, IsStart, CpsInfo, &ToBeRemoved); if (CpsInfo.ContStateBytes != 0) { - CompilerUtils::replaceAllPointerUses(&Builder, ContFrame, CpsStack, ToBeRemoved); + CompilerUtils::replaceAllPointerUses(ContFrame, CpsStack, ToBeRemoved); } else { // If there is no continuation state, replace it with a poison // value instead of a zero-sized stack allocation. 
@@ -284,7 +281,6 @@ static void buildArgInfos(Function *F, bool IsStart, SmallVector &AllArg SmallVector &AllArgValues, uint32_t &StartReturnArg, SmallVector &ParamAttrs, SmallVector &InstsToRemove, SmallVector &ReturnValueOps) { - auto &Context = F->getContext(); AttributeList FAttrs = F->getAttributes(); if (IsStart) { unsigned ArgNo = 0; @@ -298,16 +294,6 @@ static void buildArgInfos(Function *F, bool IsStart, SmallVector &AllArg ArgNo++; } } else { - // Add i32 %rcr here - Type *I32 = Type::getInt32Ty(Context); - AllArgTypes.push_back(I32); - AllArgValues.push_back(nullptr); - if (lgc::cps::isCpsFunction(*F)) { - // Add i32 %shader-index for resume part. - AllArgTypes.push_back(I32); - AllArgValues.push_back(nullptr); - } - // Find arguments from lgc.ilcps.getreturnvalue calls for (auto &I : F->getEntryBlock()) { if (auto *Intr = dyn_cast(&I)) { @@ -447,7 +433,8 @@ void CleanupContinuationsPassImpl::processContinuations() { // b.) change the address space for cps stack to 32. // 2. prepare arguments passed to cps.jump and insert the call at the exit of // start part. - // 3. Edit resume signature to add the state/rcr/shader-index/returnvalues. + // 3. Edit resume signature to add the state and the return values. This adds the shader record index and the return + // address to the function signature. SmallVector ToErase; for (auto &FuncData : ToProcess) { LLVM_DEBUG(dbgs() << "Processing function: " << FuncData.first->getName() << "\n"); @@ -632,11 +619,6 @@ void CleanupContinuationsPassImpl::handleSingleContinue(ContinuationData &Data, SmallVector TailArgs; Value *ResumeAddr = nullptr; Value *CR = nullptr; - unsigned LevelImm = -1; - - uint32_t SkipCount = 2; - if (ContHelper::isLgcCpsModule(*Call->getModule())) - SkipCount = ContHelper::isWaitAwaitCall(*Call) ? 
3 : 2; if (lgc::rt::getLgcRtShaderStage(Call->getFunction()) != lgc::rt::RayTracingShaderStage::KernelEntry) { ResumeAddr = Builder.create(ResumeFun); @@ -646,15 +628,14 @@ void CleanupContinuationsPassImpl::handleSingleContinue(ContinuationData &Data, } CR = Call->getArgOperand(0); - TailArgs.append(SmallVector(drop_begin(Call->args(), SkipCount))); + TailArgs.append(SmallVector(drop_begin(Call->args(), 3))); - if (lgc::cps::isCpsFunction(*Call->getFunction())) { - Value *Level = Call->getArgOperand(SkipCount - 1); - LevelImm = cast(Level)->getZExtValue(); - } + Value *ShaderIndex = Call->getArgOperand(2); + Value *Level = Call->getArgOperand(1); + uint32_t LevelImm = cast(Level)->getZExtValue(); auto *Csp = PoisonValue::get(Builder.getInt32Ty()); - auto *JumpCall = Builder.create(CR, LevelImm, Csp, ResumeAddr, TailArgs); + auto *JumpCall = Builder.create(CR, LevelImm, Csp, ShaderIndex, ResumeAddr, TailArgs); // Replace this instruction with a call to cps.jump. JumpCall->copyMetadata(*Call); @@ -678,10 +659,7 @@ void CleanupContinuationsPassImpl::lowerIntrinsicCall(Function *F, ContinuationD return; CompilerUtils::CrossModuleInliner CrossInliner; - // Signature of cps function: { rcr, shader-index, system-data} - const uint32_t SystemDataArgIdx = lgc::cps::isCpsFunction(*F) ? 
CpsArgIdx::SystemData : 1; - - Value *SystemDataArg = F->getArg(SystemDataArgIdx); + Value *SystemDataArg = F->getArg(CpsArgIdx::SystemData); Type *SystemDataTy = SystemDataArg->getType(); Builder.SetInsertPointPastAllocas(F); @@ -807,7 +785,7 @@ void CleanupContinuationsPassImpl::handleContStackIntrinsic(FunctionAnalysisMana Replacement = Builder.create(); } else if (FuncName.starts_with("Load")) { Value *Addr = ConstantFoldInstruction(CInst.getFunction(), CInst.getArgOperand(0)); - Value *Ptr = Builder.CreateIntToPtr(Addr, CInst.getType()->getPointerTo(lgc::cps::stackAddrSpace)); + Value *Ptr = Builder.CreateIntToPtr(Addr, Builder.getPtrTy(lgc::cps::stackAddrSpace)); Replacement = Builder.CreateAlignedLoad(DestTy, Ptr, Align(CpsStackLowering::getContinuationStackAlignment())); if (FuncName.starts_with("LoadLastUse")) @@ -820,7 +798,7 @@ void CleanupContinuationsPassImpl::handleContStackIntrinsic(FunctionAnalysisMana Value *Addr = ConstantFoldInstruction(CInst.getFunction(), CInst.getArgOperand(0)); Value *Val = CInst.getArgOperand(1); - Value *Ptr = Builder.CreateIntToPtr(Addr, Val->getType()->getPointerTo(lgc::cps::stackAddrSpace)); + Value *Ptr = Builder.CreateIntToPtr(Addr, Builder.getPtrTy(lgc::cps::stackAddrSpace)); Builder.CreateAlignedStore(Val, Ptr, Align(CpsStackLowering::getContinuationStackAlignment())); IsMemoryAccess = true; @@ -862,19 +840,17 @@ void CleanupContinuationsPassImpl::lowerGetResumePoint(Module &Mod) { // Re-create the lgc.cps.jump call without the return address // argument, since the calling code handles it manually. 
- if (!lgc::cps::isCpsFunction(*Jump->getFunction())) { - SmallVector Args; - for (unsigned I = 0; I < Jump->arg_size(); I++) { - if (I != 3) // Return address argument - Args.push_back(Jump->getArgOperand(I)); - } + SmallVector Args; + for (unsigned I = 0; I < Jump->arg_size(); I++) { + if (I != 4) // Return address argument + Args.push_back(Jump->getArgOperand(I)); + } - Builder.SetInsertPoint(Jump); - auto *NewCall = Builder.CreateCall(Jump->getCalledFunction(), Args); - NewCall->copyMetadata(*Jump); + Builder.SetInsertPoint(Jump); + auto *NewCall = Builder.CreateCall(Jump->getCalledFunction(), Args); + NewCall->copyMetadata(*Jump); - Jump->eraseFromParent(); - } + Jump->eraseFromParent(); } } } @@ -912,11 +888,8 @@ llvm::PreservedAnalyses CleanupContinuationsPassImpl::run() { ContFrame = F->getArg(F->arg_size() - 1); else ContFrame = F->getArg(0); - if (!ContFrame->user_empty()) { + if (!ContFrame->user_empty()) FuncData.second.ContStateBytes = MinimumContinuationStateBytes; - if (MinimumContinuationStateBytes > MaxContStateBytes) - MaxContStateBytes = MinimumContinuationStateBytes; - } } } } diff --git a/llvmraytracing/lib/ContStateBuilder.cpp b/llvmraytracing/lib/ContStateBuilder.cpp new file mode 100644 index 0000000000..974daa7b38 --- /dev/null +++ b/llvmraytracing/lib/ContStateBuilder.cpp @@ -0,0 +1,1369 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +//===- ContStateBuilder.cpp - A custom ABI for LLVM Coroutines---------------===// +// +// This file defines Continuations Passing Style Return-Continuation ABI for +// LLVM coroutine transforms that is used to build the cont-state. 
+//===----------------------------------------------------------------------===// + +#include "ContStateBuilder.h" +#include "compilerutils/CompilerUtils.h" +#include "compilerutils/IRSerializationUtils.h" +#include "llvmraytracing/ContinuationsUtil.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/ModuleSlotTracker.h" +#include "llvm/IR/NoFolder.h" +#include "llvm/IR/TypedPointerType.h" +#include "llvm/Support/OptimizedStructLayout.h" +#include "llvm/Transforms/Coroutines/SpillUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +#define DEBUG_TYPE "cont-state-builder" +#define DEBUG_DUMP_CFG(FUNC, MSG) \ + DEBUG_WITH_TYPE("cont-state-cfg-dump", irserializationutils::writeCFGToDotFile(FUNC, MSG)) + +using namespace llvm; +using namespace llvmraytracing; + +static cl::opt ReportContStateAccessCounts( + "report-cont-state-access-counts", + cl::desc("Report on the number of spills (stores) and reloads (loads) from the cont state."), cl::init(false), + cl::Hidden); + +#ifndef NDEBUG +// When debugging a potential issue with the cont-state-builder try setting +// this option to verify the issue resides within the builder. +static cl::opt UseLLVMContStateBuilder("use-llvm-cont-state-builder", + cl::desc("Use LLVM's built-in continuation state builder."), + cl::init(false), cl::Hidden); +#endif + +namespace { + +// Representation of a row in the frame-table. +struct CoroFrameRow { + CoroFrameRow(const DataLayout &DL, Value *D) : Def(D) { + // Determine alignment of Def + if (auto *AI = dyn_cast(Def)) { + Ty = AI->getAllocatedType(); + + // Make an array type if this is a static array allocation. 
+ if (AI->isArrayAllocation()) { + if (auto *CI = dyn_cast(AI->getArraySize())) + Ty = ArrayType::get(Ty, CI->getValue().getZExtValue()); + else + report_fatal_error("Continuations cannot handle non static allocas yet"); + } + assert(Ty && "must provide a type for a field"); + + // The field size is always the alloc size of the type. + Size = DL.getTypeAllocSize(Ty); + assert(Size); + + Alignment = MaybeAlign(AI->getAlign()).value_or(DL.getABITypeAlign(Ty)); + return; + } + + Ty = Def->getType(); + + assert(Ty && "must provide a type for a field"); + Alignment = DL.getABITypeAlign(Ty); + + // The field size is always the alloc size of the type. + Size = DL.getTypeStoreSize(Ty); + assert(Size); + } + + // The original definition of the value or an alloca + Value *Def = nullptr; // May be an instruction or arg + + // Suspend is in the set if the Def resides in the frame associated with the + // suspend. The value does not necessarily cross the suspend. + SmallSet ResidesInSuspendFrame; + + // Offset of value (wrt to this row), in the frame. Note that a value may + // occupy different parts of a frame if it is respilled. To handle that + // case one row per frame slot is used. OptimizedStructLayout is used for + // frame-opt=min-gap so we initialize the value to FlexibleOffset. + uint64_t Offset = OptimizedStructLayoutField::FlexibleOffset; + + // Alignment is either the type's alignment or the alloca's alignment. + Align Alignment; + + // Size of field in bytes required for Def + uint64_t Size = 0; + + // Type of Def, for AllocaInst this is the alloca type + Type *Ty = nullptr; + + // True if 'spill-on=def' and value has been spilled. + bool SpilledOnDef = false; + // Block is in set if value is reloaded there. + SmallSet ReloadedOnBB; + // Block is in set if a GEP has been generated for the value there. + SmallMapVector GepInBB; + + // Set of all spill instructions, required for SSA updating. 
+ SmallVector Spills; + // Set of all reload instructions, required for SSA updating. + SmallVector Reloads; + + // Note: Reloads and spills are added for one suspend at a time. So if it is + // necessary to know the reloads or spills associated with a given suspend we + // only need to know their start and end indices within the vectors. We take + // advantage of this when removing dominate reloads. The start,end pairs per + // suspend are not currently recorded. + + void dump() const; +}; + +using CoroFrameTableTy = std::vector; + +struct CoroFrameStruct { + // Note, although each suspend has a different struct layout only one malloc + // is done for the coroutine. If fields don't move from suspend to suspend + // then they don't need to be respilled. + + // Struct layout, optimized by LLVM's OptimizedStructLayout + SmallVector Fields; + + // Alignment of the frame + Align Alignment; + + // Size of frame in bytes + uint64_t Size = 0; + + // Suspend and resume BBs + BasicBlock *SuspendBB; + BasicBlock *ResumeBB; + + // Crossing values checker + std::unique_ptr Checker; + + // SmallMapVector from a spill candidate to a list of its crossing uses. + coro::SpillInfo CandidateSpills; + + // AllocaInfo includes aliases for crossing allocas. + SmallVector CandidateAllocas; + + void dumpField(const OptimizedStructLayoutField &F, const CoroFrameTableTy &) const; + void dump(const CoroFrameTableTy &) const; +}; + +class ContStateBuilderImpl { +public: + Function &F; + coro::Shape &Shape; + std::function IsMaterializable; + + Module &M; + const DataLayout &DL; + + ContStateBuilderImpl(Function &F, coro::Shape &S, std::function IsMaterializable) + : F(F), Shape(S), IsMaterializable(IsMaterializable), M(*F.getParent()), DL(M.getDataLayout()) {} + + // Allocate the coroutine frame and do spill/reload as needed. + void buildCoroutineFrame(); + + // Representation of the combination of all frames, in a table, required for + // the coroutine. 
+ CoroFrameTableTy FrameTable; + + // Representation of the frame for the current suspend. If the index is int + // max then the value is needed and a row will be added to the FrameTable. + // Otherwise, it is an existing entry in the FrameTable. + using DefRowMapTy = SmallMapVector; + + // Value to FrameTable Row map -- used to ensure a value always has the same + // location in the frame. + DefRowMapTy AllFrameValues; + + // Map of the optimized struct and fields for each suspend's frame. + SmallMapVector FrameStructs; + + // Used to allocate the frame with the size needed to handle the largest + // computed struct layout and determine if the inline storage is sufficient + // to hold the frame. + // Max Frame -- Largest frame required by the suspends. + // Max Alignment -- Largest individual field's alignment. + uint64_t MaxFrameSize = 0; + Align MaxFrameAlign; + + // Helper for building the FrameTable, Count is incremented if a new value is inserted. + // Returns true if the Def is added, false if it already existed in the FrameTable. + bool tryInsertFrameTableRow(Value *Def); + + // Go through candidate list and add values that are needed for the suspend + // to the frame. Note: the location in the frame is not yet finalized. + void addValuesToFrameTable(AnyCoroSuspendInst *Suspend, const coro::SpillInfo &CandidateSpills, + const SmallVector &CandidateAllocas); + + // Make the rows reside in the given suspend's frame + void makeRowsResideInSuspendFrame(DefRowMapTy &ValueRows, AnyCoroSuspendInst *Suspend); + + // Determine location of gaps in the current frame struct layout. + void initFrameStructLayout(AnyCoroSuspendInst *Suspend, CoroFrameStruct &Struct); + + // Allocate fields according to program order. + void computeFrameStructLayoutGreedy(AnyCoroSuspendInst *Suspend, CoroFrameStruct &Struct, bool IsAlloca); + + // Finalize the struct layout by sorting for spilling and reload, and + // determining the max frame size and alignments. 
+ void finalizeFrameStructLayout(CoroFrameStruct &Struct); + + // Create the frame type, its size is the maximum of the frame sizes + // required at each suspend. + StructType *createFrameTy() const; + + // In the following spill and reload methods the new insts are added to the + // insts FrameRow::Reloads and FrameRow::Spills so we can build its phi node + // network later. + + // Insert spills and reloads + void insertSpills(coro::Shape &Shape, DominatorTree &DT); + void insertReloads(); + + // With all spills and reloads in-place now we can generate the phi network + // that carries the values between defs and uses. + void buildPhiNetwork(); + + // Replace poisoned frame-address values with computed values + void createFrameGEPs(SmallVector &DeadInstructions); + + // Remove unused reloads + void removeUnusedReloads(); + + // Report stats collected by FrameTable and FrameStruct data structures + void reportContStateInfo() const; +}; + +/// Return true if Def is an Arg with the ByVal attribute. 
+[[maybe_unused]] static bool isArgByVal(Value *Def) { + if (auto *Arg = dyn_cast(Def)) + return Arg->hasByValAttr(); + return false; +} + +static std::string getLabel(Function *F) { + if (F->hasName()) + return F->getName().str(); + ModuleSlotTracker MST(F->getParent()); + MST.incorporateFunction(*F); + + return std::to_string(MST.getLocalSlot(F)); +} + +static std::string getLabel(BasicBlock *BB) { + if (BB->hasName()) + return BB->getName().str(); + + Function *F = BB->getParent(); + + ModuleSlotTracker MST(F->getParent()); + MST.incorporateFunction(*F); + + return std::to_string(MST.getLocalSlot(BB)); +} + +static std::string getLabel(Value *V) { + if (V->hasName()) + return V->getName().str(); + + if (!isa(V)) + return ""; + + BasicBlock *BB = dyn_cast(V)->getParent(); + Function *F = BB->getParent(); + + ModuleSlotTracker MST(F->getParent()); + MST.incorporateFunction(*F); + + return std::to_string(MST.getLocalSlot(V)); +} + +static std::string getAllNames(const SmallSet &List) { + std::string S; + if (List.empty()) + return ""; + + for (BasicBlock *BB : List) + S = S + " %" + getLabel(BB); + + return S; +} + +void CoroFrameRow::dump() const { + if (Def) { + dbgs() << "\tDef: "; + LLVM_DEBUG(Def->dump()); + if (isa(Def)) + dbgs() << "\tDefBB: %" << getLabel(cast(Def)->getParent()) << "\n"; + else if (isa(Def)) + dbgs() << "\tDefBB: %" << getLabel(cast(Def)->getParent()) << "\n"; + else + dbgs() << "\tDefBB: Unknown Value Type\n"; + } else { + dbgs() << "\tDef: nullptr\n"; + dbgs() << "\tDefBB: NA\n"; + } + std::string OffsetStr = + Offset != OptimizedStructLayoutField::FlexibleOffset ? std::to_string(Offset) : std::string("Flexible"); + dbgs() << "\tOffset: " << OffsetStr << ", " << Size << " bytes, "; + dbgs() << "Align: " << Alignment.value() << " bytes\n"; + dbgs() << "\tTy: "; + LLVM_DEBUG(Ty->dump()); + dbgs() << "\tResidesInSuspendFrames: " << ResidesInSuspendFrame.size() << "\n"; + if (!isa(Def)) { + dbgs() << "\tSpilledOnDef: " << (SpilledOnDef ? 
"true" : "false") << "\n"; + dbgs() << "\tReloadedOnBB: " << getAllNames(ReloadedOnBB) << "\n"; + dbgs() << "\tSpills: " << Spills.size() << "\n"; + dbgs() << "\tReloads: " << Reloads.size() << "\n"; + } +} + +void CoroFrameStruct::dumpField(const OptimizedStructLayoutField &F, const CoroFrameTableTy &FrameTable) const { + auto Idx = reinterpret_cast(F.Id); + const CoroFrameRow *Row = &FrameTable[Idx]; + dbgs() << " Frame Table Row " << std::to_string(Idx); + if (isa(Row->Def)) + dbgs() << " -- Alloca for %" << getLabel(Row->Def); + else if (isa(Row->Def)) + dbgs() << " -- Spill of Argument %" << getLabel(Row->Def); + else + dbgs() << " -- Spill of Inst %" << getLabel(Row->Def); + + // Determine if value is a spill or alloca + if (auto *DefAlloca = dyn_cast(Row->Def)) { + auto I = std::find_if(CandidateAllocas.begin(), CandidateAllocas.end(), + [DefAlloca](const coro::AllocaInfo &AI) { return AI.Alloca == DefAlloca; }); + if (I == CandidateAllocas.end()) + dbgs() << " -- Unused\n"; + else + dbgs() << " -- Aliases: " << I->Aliases.size() << "\n"; + } else { + if (!CandidateSpills.contains(Row->Def)) + dbgs() << " -- Unused\n"; + else + dbgs() << " -- Crossing Uses: " << CandidateSpills.lookup(Row->Def).size() << "\n"; + } + + if (F.hasFixedOffset()) { + dbgs() << "\t\tOffset: " << F.Offset << " -> " << F.getEndOffset() << ", " << F.Size << " bytes, "; + dbgs() << "Align: " << F.Alignment.value() << " bytes\n"; + } else { + dbgs() << "\t\tOffset: \n"; + } +} + +void CoroFrameStruct::dump(const CoroFrameTableTy &FrameTable) const { + dbgs() << "\tFields: \n"; + unsigned idx = 0; + for (const auto &F : Fields) { + dbgs() << "\tField " << idx++ << ":"; + dumpField(F, FrameTable); + } + dbgs() << "\tFrameStruct Size: " << Size << " bytes, "; + dbgs() << "Align: " << Alignment.value() << " bytes\n"; + std::string SuspendBBName = SuspendBB ? getLabel(SuspendBB) : "nullptr"; + dbgs() << "\tSuspendBB: %" << SuspendBBName << "\n"; + std::string ResumeBBName = ResumeBB ? 
getLabel(ResumeBB) : "nullptr"; + dbgs() << "\tResumeBB: %" << ResumeBBName << "\n"; +} + +bool ContStateBuilderImpl::tryInsertFrameTableRow(Value *Def) { + auto Idx = FrameTable.size(); + + assert(Def); + auto [Itr, Inserted] = AllFrameValues.try_emplace(Def, Idx); + + if (Inserted) { + // Add new value + FrameTable.emplace_back(CoroFrameRow(DL, Def)); + } else { + // Reuse existing value + Idx = Itr->second; + } + + return Inserted; +} + +void ContStateBuilderImpl::addValuesToFrameTable(AnyCoroSuspendInst *Suspend, const coro::SpillInfo &CandidateSpills, + const SmallVector &CandidateAllocas) { + [[maybe_unused]] unsigned NewArgBytes = 0; + [[maybe_unused]] unsigned NewInstBytes = 0; + [[maybe_unused]] unsigned NewAllocaBytes = 0; + + // Add candidate spills. For each suspend that the value crosses it will be + // added to its frame. The def will be spilled to the frame and a load from + // the frame will occur before uses where the def-use crosses the suspend. + for (auto &[Def, Aliases] : CandidateSpills) { + if (tryInsertFrameTableRow(Def)) { + // Statistics collection + LLVM_DEBUG({ + auto Idx = AllFrameValues.lookup(Def); + auto &Row = FrameTable[Idx]; + if (isArgByVal(Def)) + llvm_unreachable("ByVal Args are unsupported"); + else if (isa(Def)) + NewArgBytes += Row.Size; + else + NewInstBytes += Row.Size; + }); + } + } + + for (auto &AI : CandidateAllocas) { + // Note: CandidateAllocas have already been determined to cross a suspend. + // We can also assume that sinkSpillUsesAfterCoroBegin moved all uses to + // after the CoroBegin. 
+ + if (tryInsertFrameTableRow(AI.Alloca)) { + // Statistics collection + LLVM_DEBUG({ + auto Idx = AllFrameValues.lookup(AI.Alloca); + auto &Row = FrameTable[Idx]; + NewAllocaBytes += Row.Size; + }); + } + } + + LLVM_DEBUG({ + dbgs() << "\tNew Alloca Bytes: " << NewAllocaBytes << "\n"; + dbgs() << "\tNew Arg Spill Bytes: " << NewArgBytes << "\n"; + dbgs() << "\tNew Inst Spill Bytes: " << NewInstBytes << "\n"; + }); + + // Adding AllFrameValues rows to the given suspend's frame will prevent + // values that are no longer needed from being overwritten. + makeRowsResideInSuspendFrame(AllFrameValues, Suspend); +} + +void ContStateBuilderImpl::makeRowsResideInSuspendFrame(DefRowMapTy &FrameValues, AnyCoroSuspendInst *Suspend) { + // Add this Suspend point to ResidesInSuspendFrame for all frame rows. + for (const auto &[Def, Idx] : FrameValues) { + auto &Row = FrameTable[Idx]; + Row.ResidesInSuspendFrame.insert(Suspend); + } +} + +// Check if Def value crosses the suspend. Note this check is used instead of +// checking the ResidesInSuspendFrame set because if eviction is not enabled +// then the ResidesInSuspendFrame set will include all suspends. 
+static bool isSuspendCrossingValue(Value *Def, const coro::SpillInfo &CandidateSpills, + const SmallVector &CandidateAllocas) { + if (auto *DefAlloca = dyn_cast(Def)) { + auto II = std::find_if(CandidateAllocas.begin(), CandidateAllocas.end(), + [DefAlloca](const coro::AllocaInfo &AI) { return AI.Alloca == DefAlloca; }); + return II != CandidateAllocas.end(); + } + + return CandidateSpills.contains(Def); +} + +static void fitNewField(Value *Val, OptimizedStructLayoutField &NewField, CoroFrameStruct &Struct) { + NewField.Offset = 0; // If Fields is empty, start at offset 0 + if (!Struct.Fields.empty()) { + NewField.Offset = alignTo(Struct.Size, NewField.Alignment); + assert(NewField.Offset >= Struct.Size); + } + + Struct.Size = NewField.getEndOffset(); + + if (Struct.Alignment < NewField.Alignment) + Struct.Alignment = NewField.Alignment; + + Struct.Fields.emplace_back(NewField); +} + +void ContStateBuilderImpl::initFrameStructLayout(AnyCoroSuspendInst *Suspend, CoroFrameStruct &Struct) { + assert(Struct.Fields.empty()); + + // First add fields that have already been located (fixed offset fields). + for (auto R : llvm::enumerate(FrameTable)) { + auto &Row = R.value(); + // Notice we include all values that have this Suspend in their + // ResidesInSuspendFrame set. That will ensure all values currently held in + // the frame will be added as a Field. If eviction is enabled that set will + // only include values that are used across the suspend. + if (Row.Offset != OptimizedStructLayoutField::FlexibleOffset && Row.ResidesInSuspendFrame.contains(Suspend)) { + // Value is in this frame, create a 'field' for it. + void *Idx = reinterpret_cast(R.index()); + Struct.Fields.emplace_back(Idx, Row.Size, Row.Alignment, Row.Offset); + } + } + + // Sort the fixed offset fields to identify gaps between existing values. 
+ llvm::sort( + Struct.Fields.begin(), Struct.Fields.end(), + [&](const OptimizedStructLayoutField &A, const OptimizedStructLayoutField &B) { return A.Offset < B.Offset; }); + + // After sorting last element in Fields is the last in memory. + if (!Struct.Fields.empty()) + Struct.Size = Struct.Fields.back().getEndOffset(); +} + +void ContStateBuilderImpl::computeFrameStructLayoutGreedy(AnyCoroSuspendInst *Suspend, CoroFrameStruct &Struct, + bool IsAlloca) { + // Add flexible fields into the gaps + for (auto R : llvm::enumerate(FrameTable)) { + auto &Row = R.value(); + // Only layout non-alloca, skip alloca - they are laid out separately + if (!IsAlloca && isa(Row.Def)) + continue; + + // Only layout alloca, skip non-alloca - they are laid out separately + if (IsAlloca && !isa(Row.Def)) + continue; + + if (Row.Offset == OptimizedStructLayoutField::FlexibleOffset && Row.ResidesInSuspendFrame.contains(Suspend)) { + // Value is in this frame, create a 'field' for it. + void *Idx = reinterpret_cast(R.index()); + OptimizedStructLayoutField NewField = {Idx, Row.Size, Row.Alignment, Row.Offset}; + fitNewField(Row.Def, NewField, Struct); + assert(NewField.Offset != OptimizedStructLayoutField::FlexibleOffset); + assert(NewField.Offset == alignTo(NewField.Offset, NewField.Alignment)); + + // Update the offsets in the FrameTable + Row.Offset = NewField.Offset; + } + } +} + +void ContStateBuilderImpl::finalizeFrameStructLayout(CoroFrameStruct &Struct) { + // Sort the fields so spills and reloads are created in sequenced such that + // their offsets are in increasing order. 
+ llvm::sort( + Struct.Fields.begin(), Struct.Fields.end(), + [&](const OptimizedStructLayoutField &A, const OptimizedStructLayoutField &B) { return A.Offset < B.Offset; }); + + assert(Struct.Fields.empty() || Struct.Fields.back().getEndOffset() == Struct.Size); + + // Record the largest frame required by the coroutine + if (MaxFrameSize < Struct.Size) + MaxFrameSize = Struct.Size; + + if (MaxFrameAlign < Struct.Alignment) + MaxFrameAlign = Struct.Alignment; +} + +StructType *ContStateBuilderImpl::createFrameTy() const { + // TODO - when allocating the array (by user) the alignment may need to be + // corrected, this can be done by over-allocating e.g. size+alignment-1, + // then offsetting the start ptr to correct the alignment. + + LLVMContext &C = F.getContext(); + + // Create a structure -- LLVM's CoroFrame builds a real struct with types + // that match the values for its frame. Here we build a struct with a sized + // array and index into that using the provided offsets. We do this for + // several reasons: + // 1) At each suspend we want the frame to have only the required fields, + // unused fields should be allowed to be overwritten by any other field, + // no matter if the types match. However, typed struct fields make this + // more difficult, potentially requiring a different struct type per + // suspend point. + // 2) Notice that offsets into the frame are computed first (above) then + // the frame type is created. LLVM's CoroFrame then builds a struct with + // typed fields. However, the struct type layout is a different method + // than the struct field optimizer and thus may have a different padding + // between fields. This could introduce alignment errors and + // out-of-bounds accesses. + // 3) It is necessary to add padding to the struct type to avoid the above + // fragility, however, that changes the index of the fields. This must be + // tracked and is another potential point of failure. 
+ // 4) The array is wrapped in a struct so it can be given a name, otherwise + // it is not possible to give a stand-alone array type a name. + // + auto Name = F.getName() + ".Frame"; + + Type *ByteArray = ArrayType::get(Type::getInt8Ty(C), MaxFrameSize); + StructType *FrameType = StructType::create(C, {ByteArray}, Name.str()); + + // Verify the struct type is the right size, i.e. no padding was added. + assert(DL.getTypeAllocSize(FrameType) == MaxFrameSize); + + return FrameType; +} + +void ContStateBuilderImpl::insertSpills(coro::Shape &Shape, DominatorTree &DT) { + LLVMContext &C = F.getContext(); + IRBuilder<> Builder(C); + + // Determine if the spill is needed for this def and set the insertion pt. + auto SetInsertPtIfRequired = [&](CoroFrameRow &Row) { + if (!Row.SpilledOnDef) { + auto I = coro::getSpillInsertionPt(Shape, Row.Def, DT); + Builder.SetInsertPoint(I); + Row.SpilledOnDef = true; + return true; + } + + return false; + }; + + for (auto &I : FrameStructs) { + auto &Struct = I.second; + + // For each value in the frame insert spill, if they do not already exist. + // Note: the location in the frame will be set when GEPs are built later + // for now the addresses are poisoned. + + // Visit each field in the struct and create spills as needed. Visit fields + // in reverse order to cause the spills to occur in-order after creation. + for (auto &Field : llvm::reverse(Struct.Fields)) { + auto Idx = reinterpret_cast(Field.Id); + CoroFrameRow &Row = FrameTable[Idx]; + Value *Def = Row.Def; + + // Allocas in the frame do not require spilling. + if (isa(Def)) + continue; + + // Do not spill here if the value does not cross this suspend. Note + // this check is needed when eviction is not used. Without eviction + // the frame will include values that do not cross it and we should + // not spill the value on suspends the value does not cross. That + // will lead to excess spilling and incorrect codegen. 
+ if (!isSuspendCrossingValue(Row.Def, Struct.CandidateSpills, Struct.CandidateAllocas)) + continue; + + if (!SetInsertPtIfRequired(Row)) + continue; + + // Generate a frame address of the Def, poison for now + Value *PoisonFrameAddr = PoisonValue::get(PointerType::get(C, 0)); + + // Generate spill for Def + StoreInst *Spill = Builder.CreateAlignedStore(Def, PoisonFrameAddr, Row.Alignment); + + // Record spill so we can build the phi node network and fix the frame + // address later. + assert(Spill); + Row.Spills.emplace_back(Spill); + } + } +} + +void ContStateBuilderImpl::insertReloads() { + LLVMContext &C = F.getContext(); + IRBuilder<> Builder(C); + + // Determine if a reload is needed for this use and set the insertion pt. + auto SetInsertPtIfRequired = [&](CoroFrameRow &Row, User *U) { + auto *UseBB = cast(U)->getParent(); + + Builder.SetInsertPoint(UseBB, UseBB->getFirstInsertionPt()); + // Mark the reloaded BB so we don't reload it a second time + auto R = Row.ReloadedOnBB.insert(UseBB); + return R.second; // False if UseBB already existed in the set. + }; + + // Generate a frame address of the Def, poison for now. + Value *PoisonFrameAddr = PoisonValue::get(PointerType::get(C, 0)); + + for (auto &I : FrameStructs) { + auto &Struct = I.second; + + // For each value in the frame insert reloads, if they do not already + // exist. Note: the location in the frame will be set when GEPs are built + // later for now the addresses are poisoned. Note: not all uses of the + // value can be set because the phi node network that connects the new defs + // must be created. + + // Visit each field in the struct and create reloads as needed. Visit the + // in reverse order to cause the reloads to occur in-order after creation. 
+ for (auto &Field : llvm::reverse(Struct.Fields)) { + auto Idx = reinterpret_cast(Field.Id); + CoroFrameRow &Row = FrameTable[Idx]; + Value *Def = Row.Def; + + // Allocas in the frame do not require reloading + if (isa(Def)) + continue; + + // Do not reload here if the value does not cross this suspend. Note this + // check is needed when eviction is not used. Without eviction the frame + // will include values that do not cross it and we should not reload the + // value on suspends the value does not cross. That will lead to excess + // reloading and incorrect codegen. + if (!isSuspendCrossingValue(Row.Def, Struct.CandidateSpills, Struct.CandidateAllocas)) + continue; + + auto &SpillUses = Struct.CandidateSpills[Def]; + + // Helper to connect a reload to the uses of this Def if the use and the + // reload are in the same BB. + auto ConnectReloadToUses = [&](LoadInst *Reload) { + for (auto *U : SpillUses) { + // If the Reload and the Use are in the same BB then relink the Use to + // the Reload. This is done here because SSA Updater cannot easily + // produce a value if the Use is in the same BB. This is also a compiler- + // time optimization because it eliminates the need to invoke SSA Updater + // for this Use. + auto *Inst = cast(U); + if (Reload->getParent() == Inst->getParent() && !isa(U)) { + Inst->replaceUsesOfWith(Def, Reload); + } + } + }; + + // If we didn't generate a reload-on-resume then try to generate reloads + // on (near) each use. + for (auto *U : SpillUses) { + if (!SetInsertPtIfRequired(Row, U)) + continue; + + // Generate reload for Def + auto *CurrentReload = Builder.CreateAlignedLoad(Row.Ty, PoisonFrameAddr, Row.Alignment, + Twine("reload.row") + std::to_string(Idx) + Twine(".") + + Row.Def->getName() + Twine(".")); + + // Record the reload so we can build the phi node network and fix the frame + // address later. 
+ Row.Reloads.emplace_back(CurrentReload); + } + + // Connect immediate uses + for (auto &Reload : Row.Reloads) { + ConnectReloadToUses(Reload); + } + } + } +} + +void ContStateBuilderImpl::buildPhiNetwork() { + LLVMContext &C = F.getContext(); + [[maybe_unused]] Value *PoisonFrameAddr = PoisonValue::get(PointerType::get(C, 0)); + + // For each value collect all defs and reloads (available values) + // Then go back and fix up all spills and uses using SSA Updater. + for (CoroFrameRow &Row : FrameTable) { + // We don't need to build the phi node network for allocas because their + // loads already inserted by the user. + if (isa(Row.Def)) + continue; + + // Setup the SSAUpdater + SSAUpdater Updater; + Updater.Initialize(Row.Ty, Row.Def->getName()); + + // Add the original def and the materialized defs so SSAUpdater has all + // available definitions of the value. + if (auto *OldInst = dyn_cast(Row.Def)) + Updater.AddAvailableValue(OldInst->getParent(), OldInst); + else if (auto *OldArg = dyn_cast(Row.Def)) + Updater.AddAvailableValue(&OldArg->getParent()->getEntryBlock(), OldArg); + else + llvm_unreachable("Unhandled type"); + + // Reloads are new definitions of the same value + for (LoadInst *ReloadInst : Row.Reloads) + Updater.AddAvailableValue(ReloadInst->getParent(), ReloadInst); + + // Copy because GetValueAtEndOfBlock will introduce additional users of + // the def (PHINodes). + SmallVector DefUsers(Row.Def->users()); + + // All users of Def are visited here to ensure all SSA uses have a proper + // phi node network connecting it to the nearest def/reload. + + // This case is rather simple, because we know the value must cross a + // suspend, and all remats should be done either on resume or right before + // any uses of old def so we can assume the value should be live-out. + for (User *U : DefUsers) { + auto *DefUse = cast(U); + auto *DefUseBB = DefUse->getParent(); + + // Check that the user is not a spill that we inserted. 
+ if (auto *DefUseSI = dyn_cast(DefUse)) { + auto It = std::find(Row.Spills.begin(), Row.Spills.end(), DefUseSI); + // If the DefUse is a spill we inserted, skip it, we already hooked it up. + if (It != Row.Spills.end()) { + // Our spills have a poison address at this point. + assert(DefUseSI->getPointerOperand() == PoisonFrameAddr); + continue; + } + } + + // If the user is a PHI node, it should be a single-edge phi node and we + // can replace its uses with the new definition. + if (auto *PN = dyn_cast(DefUse)) { + assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming " + "values in the PHINode"); + + if (!PN->use_empty()) { + Value *NewDef = Updater.GetValueAtEndOfBlock(DefUseBB); + PN->replaceAllUsesWith(NewDef); + } + + // Now the phi node is dead + PN->eraseFromParent(); + continue; + } + + // For non phi-nodes we replace the uses of the old def with the new def. + Value *NewDef = nullptr; + for (unsigned i = 0, E = DefUse->getNumOperands(); i != E; ++i) { + if (DefUse->getOperand(i) == Row.Def) { + if (!NewDef) + NewDef = Updater.GetValueAtEndOfBlock(DefUseBB); + DefUse->setOperand(i, NewDef); + } + } + } + } +} + +void ContStateBuilderImpl::createFrameGEPs(SmallVector &DeadInstructions) { + LLVMContext &C = F.getContext(); + IRBuilder Builder(C); + + // Replace the poison on the spills and reloads with GEPs into the frame. + Value *PoisonFrameAddr = PoisonValue::get(PointerType::get(C, 0)); + + // Insertion point for GEP that replaces alloca + BasicBlock *FramePtrBB = Shape.getInsertPtAfterFramePtr()->getParent(); + + // Split the FramePtrBB to add a 'spill' block immediately following the + // frame ptr. 
+ auto SpillBlock = FramePtrBB->splitBasicBlock(Shape.getInsertPtAfterFramePtr(), Twine("AllocaSpillBB")); + SpillBlock->splitBasicBlock(&SpillBlock->front(), Twine("PostSpill.") + FramePtrBB->getName()); + Shape.AllocaSpillBlock = SpillBlock; + + // Each suspend corresponds to a potentially unique frame + for (auto &I : FrameStructs) { + [[maybe_unused]] auto *Suspend = I.first; + auto &Struct = I.second; + // Visit each field in the struct and create reloads as needed. Visit the + // fields in reverse order to cause the reloads to occur in-order after + // creation. + for (auto &Field : llvm::reverse(Struct.Fields)) { + auto Idx = reinterpret_cast(Field.Id); + CoroFrameRow &Row = FrameTable[Idx]; + + assert(Row.ResidesInSuspendFrame.contains(Suspend)); + assert(Row.Offset != OptimizedStructLayoutField::FlexibleOffset); + + auto TryReuseGep = [&](BasicBlock *BB, BasicBlock::iterator InsertPt, const Twine &Label, StringRef Name) { + auto [Itr, Inserted] = Row.GepInBB.try_emplace(BB, nullptr); + + // Add a new GEP if the BB is not in the map + if (!Inserted) { + // Get GEP from map + assert(Itr->second); + return Itr->second; + } + + // Set the insert pt of the GEP + Builder.SetInsertPoint(InsertPt); + + // FrameTy is a struct containing an array of int8, i.e. + // struct value_frame { char data[size]; }; + // FramePtr will be replaced by an alloca of the right size, i.e. 
+ // Accesses to the frame will look like + // v->data[Row.Offset]; + // So this translates to indices { + // 0, <- frame ptr is not an array, we don't index into it + // 0, <- accessing the first member (data) in the struct + // Row.Offset <- accessing an element of the data array + // } + Value *Idxs[] = {ConstantInt::get(Type::getInt32Ty(C), 0), ConstantInt::get(Type::getInt32Ty(C), 0), + ConstantInt::get(Type::getInt32Ty(C), Row.Offset)}; + + // GEP replacing alloca + auto *GepInst = Builder.CreateInBoundsGEP(Shape.FrameTy, Shape.FramePtr, Idxs, + Label + Twine(".addr.row") + std::to_string(Idx) + Twine(".") + Name + + Twine(".")); + Itr->second = dyn_cast(GepInst); + + return Itr->second; + }; + + // Fix allocas that are taken over by the frame. Note that allocas that + // do not cross suspends are not included in the FrameTable. + if (auto *Alloca = dyn_cast(Row.Def)) { + // Insert a GEP to replace the alloca immediately after the malloc of + // the coro frame to ensure all accesses are dominated by the GEP. + // Insert at the end of the spill block. + auto *GepInst = + TryReuseGep(SpillBlock, SpillBlock->getTerminator()->getIterator(), Twine("alloca"), Alloca->getName()); + + // Note: that the location of the GEP is not be the same as that of + // the alloca. The GEP is put into the SpillBlock. The SpillBlock is + // the entry point of each continuation, so any instrs put there will + // be available to all continuations after the main function is split. + CompilerUtils::replaceAllPointerUses(Alloca, GepInst, DeadInstructions); + + // Alloca is dead, we may visit this Row more than once, so we need to + // check if the value is in the DeadInstructions list already. 
+ if (std::find(DeadInstructions.begin(), DeadInstructions.end(), Alloca) == DeadInstructions.end()) { + // Insert the AllocaInst if it's not a duplicate + DeadInstructions.push_back(Alloca); + } + + continue; // Alloca do not have Spills or Reloads + } + + // Fix spill (store) address + for (StoreInst *SpillInst : Row.Spills) { + auto *SpillBB = SpillInst->getParent(); + + // Set insertion point before the SpillInst + auto *GepInst = + TryReuseGep(SpillBB, SpillInst->getParent()->getFirstInsertionPt(), Twine("frame"), Row.Def->getName()); + + // Replace the SpillInst ptr, that is Poison, with the GEP. + if (SpillInst->getPointerOperand() == PoisonFrameAddr) + SpillInst->setOperand(1, GepInst); + } + + // Fix reload (load) address + for (LoadInst *ReloadInst : Row.Reloads) { + auto *ReloadBB = ReloadInst->getParent(); + + // Set insertion point before the ReloadInst + auto *GepInst = + TryReuseGep(ReloadBB, ReloadInst->getParent()->getFirstInsertionPt(), Twine("frame"), Row.Def->getName()); + + // Replace the ReloadInst ptr, that is Poison, with the GEP. + if (ReloadInst->getPointerOperand() == PoisonFrameAddr) + ReloadInst->setOperand(0, GepInst); + } + } + } +} + +void ContStateBuilderImpl::removeUnusedReloads() { + for (auto &Row : FrameTable) { + // There should be 1 reload per BB where a reload occurs + assert(Row.Reloads.size() == Row.ReloadedOnBB.size()); + + SmallVector UsedReloads; + + // Identify the used reloads and keep them, remove the unused ones. + for (LoadInst *R : Row.Reloads) { + + if (!R->use_empty()) { + UsedReloads.push_back(R); + continue; + } + + assert(R->use_empty() && R->materialized_use_empty()); + + // This is an unused reload, remove it. + Row.ReloadedOnBB.erase(R->getParent()); + + // Remove reload + R->eraseFromParent(); + } + + // Now remove the old reloads list. 
+ Row.Reloads = UsedReloads; + + LLVM_DEBUG({ + for (LoadInst *R : Row.Reloads) + assert(!R->use_empty()); + }); + + // There should be 1 reload per BB where a reload occurs + assert(Row.Reloads.size() == Row.ReloadedOnBB.size()); + } +} + +static bool hasPoisonOperand(Instruction *I) { + // Check GetElementPtrInst + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + for (auto &Op : GEP->operands()) + if (isa<PoisonValue>(Op)) + return true; + } + // Check LoadInst + else if (auto *LI = dyn_cast<LoadInst>(I)) { + if (isa<PoisonValue>(LI->getPointerOperand())) + return true; + } + // Check StoreInst + else if (auto *SI = dyn_cast<StoreInst>(I)) { + if (isa<PoisonValue>(SI->getPointerOperand()) || isa<PoisonValue>(SI->getValueOperand())) + return true; + } + // Check PHINode + else if (auto *PN = dyn_cast<PHINode>(I)) { + for (auto &Op : PN->operands()) + if (isa<PoisonValue>(Op)) + return true; + } + + return false; +} + +[[maybe_unused]] static void collectInstWithPoison(Function &F, SmallSet<Instruction *, 16> &PoisonInstructions) { + for (auto &BB : F) { + for (auto &I : BB) { + // Record the instruction if it has a poison operand + if (hasPoisonOperand(&I)) { + PoisonInstructions.insert(&I); + } + } + } +} + +[[maybe_unused]] static bool hasNewPoisonOperand(Function &F, const SmallSet<Instruction *, 16> &PoisonInstructions) { + bool foundNewPoison = false; + + for (auto &BB : F) { + for (auto &I : BB) { + if (PoisonInstructions.count(&I) > 0) + continue; + + // If a new poison operand is found, dump the instruction and set the flag + if (hasPoisonOperand(&I)) { + errs() << "Found poison operand in instruction: " << I << "\n"; + foundNewPoison = true; + } + } + } + + return foundNewPoison; +} + +void ContStateBuilderImpl::reportContStateInfo() const { + uint64_t TotalReloads = 0; + uint64_t TotalSpills = 0; + uint64_t TotalGeps = 0; + + for (auto &Row : FrameTable) { + if (isa<AllocaInst>(Row.Def)) + continue; + + TotalGeps += Row.GepInBB.size(); + TotalReloads += Row.Reloads.size(); + TotalSpills++; + + for (auto &I : FrameStructs) { + auto &Struct = I.second; + if (!Struct.CandidateSpills.contains(Row.Def)) + continue; + 
+ } + } + + // Note, these stats should closely match the stats reported by + // reportGepsSpillsAndReloads that counts raw geps, reloads and spills + // before and after building the cont state. + dbgs() << "Final # of Geps: " << TotalGeps << "\n"; + dbgs() << "Final # of Reloads: " << TotalReloads << "\n"; + dbgs() << "Final # of Spills: " << TotalSpills << "\n"; +} + +template <typename T> static unsigned countInstrs(const Function &F) { + unsigned Total = 0; + for (auto &BB : F) + for (auto &I : BB) + if (isa<T>(&I)) + Total++; + return Total; +} + +// Report absolute number of new geps, spills and reloads inserted by the +// continuation state builder. +static void reportGepsSpillsAndReloads(Function &F, unsigned NonFrameGeps, unsigned NonSpillStores, + unsigned NonReloadLoads) { + if (ReportContStateAccessCounts) { + unsigned FrameGeps = countInstrs<GetElementPtrInst>(F); + assert(FrameGeps >= NonFrameGeps); + FrameGeps -= NonFrameGeps; + + unsigned SpillStores = countInstrs<StoreInst>(F); + assert(SpillStores >= NonSpillStores); + SpillStores -= NonSpillStores; + + unsigned ReloadLoads = countInstrs<LoadInst>(F); + assert(ReloadLoads >= NonReloadLoads); + ReloadLoads -= NonReloadLoads; + + auto Stage = lgc::rt::getLgcRtShaderStage(&F); + dbgs() << "Continuation state geps of \"" << F.getName() << "\" (" << Stage << "): " << FrameGeps << "\n"; + dbgs() << "Continuation state reloads of \"" << F.getName() << "\" (" << Stage << "): " << ReloadLoads << "\n"; + dbgs() << "Continuation state spills of \"" << F.getName() << "\" (" << Stage << "): " << SpillStores << "\n"; + } +} + +void ContStateBuilderImpl::buildCoroutineFrame() { + // This method builds a unique frame for each suspend point. The frame + // includes values that are needed for the resume. + // + // The spills and reloads are inserted with poison addresses. These addresses + // are set to real frame addresses after all spills and reloads for all + // frames have been identified and inserted. 
This makes it easier to both + // optimize the frame layout and optimize the location of spills and reloads + // without worrying about how to get the right frame addresses. Similarly, + // the spilled values and the uses of the reloaded values are also set after + // all spills and reloads have been inserted. This allows us to use SSA + // Updater to build the phi node networks when necessary. + + // ======== Do Rematerializations ======== + + LLVM_DEBUG(dbgs() << "Running Rematerialization\n"); + + // For default remat we need to do that before spilling + SuspendCrossingInfo FullChecker(F, Shape.CoroSuspends, Shape.CoroEnds); + coro::doRematerializations(F, FullChecker, IsMaterializable); + + // ======== Initial Load and Store Stats ======== + + unsigned NonFrameGeps = countInstrs(F); + unsigned NonSpillStores = countInstrs(F); + unsigned NonReloadLoads = countInstrs(F); + + // ======== Create a frame struct per suspend ======== + + LLVM_DEBUG(dbgs() << "Running SuspendCrossingInfo Analysis\n"); + + for (auto *Suspend : Shape.CoroSuspends) { + // Create a frame struct per suspend + auto &Struct = FrameStructs[Suspend]; + Struct.Checker = + std::make_unique(F, SmallVector({Suspend}), Shape.CoroEnds); + + // Normalization already splits the BB around the suspend instructions. + BasicBlock *BB = Suspend->getParent(); + Struct.SuspendBB = BB->getSinglePredecessor(); + Struct.ResumeBB = BB->getSingleSuccessor(); + } + + // Renumber the blocks, normalization will have inserted new blocks. + F.renumberBlocks(); + DominatorTree DT(F); + + SmallVector DeadInstructions; + SmallVector LocalAllocas; + // Note: CoroAlloca* are used by swift, we don't need to handle them. 
+ + DEBUG_DUMP_CFG(F, "pre-frame-build-cfg"); + + // ======== Gather candidate spills and allocas ======== + + LLVM_DEBUG(dbgs() << "Gathering Spills and Allocas\n"); + + for (auto *Suspend : Shape.CoroSuspends) { + // Create a frame struct per suspend + auto &Struct = FrameStructs[Suspend]; + + assert(Struct.CandidateSpills.empty()); + assert(Struct.CandidateAllocas.empty()); + + // Collect the candidate spills for arguments and other not-materializable + // values for this suspend. + coro::collectSpillsFromArgs(Struct.CandidateSpills, F, *Struct.Checker); + coro::collectSpillsAndAllocasFromInsts(Struct.CandidateSpills, Struct.CandidateAllocas, DeadInstructions, + LocalAllocas, F, *Struct.Checker, DT, Shape); + } + + // ======== Frame Layout ======== + + auto Id = Shape.getRetconCoroId(); + auto RetconSize = Id->getStorageSize(); + auto RetconAlign = Id->getStorageAlignment(); + + LLVM_DEBUG({ + dbgs() << "----- Frame Data At Each Suspend -----\n"; + auto Stage = lgc::rt::getLgcRtShaderStage(&F); + dbgs() << "Function: " << F.getName() << " (" << Stage << ")\n"; + dbgs() << "Total # of Suspends: " << FrameStructs.size() << "\n"; + }); + + for (auto R : llvm::enumerate(FrameStructs)) { + auto *Suspend = R.value().first; + auto &Struct = R.value().second; + LLVM_DEBUG(dbgs() << "Suspend " << R.index() << "\n"); + LLVM_DEBUG(dbgs() << "\tSuspendInst: "; Suspend->dump()); + LLVM_DEBUG(dbgs() << "\tSuspendBB: %" << getLabel(Suspend->getParent()) << "\n"); + + // Sink spill uses. This will move all uses of allocas to after the + // CoroBegin ensuring that all access to the alloca ptr occur after + // the Coro frame ptr has been malloced by the user code. This simplifies + // handling alloca because it means we can simply replace the alloca with + // space on the frame. So there are two cases: the alloca does not cross a + // suspend so we leave it alone, or the alloca crosses a suspend so we put + // it into the coroutine frame. 
+ coro::sinkSpillUsesAfterCoroBegin(DT, Shape.CoroBegin, Struct.CandidateSpills, Struct.CandidateAllocas); + + // Go through candidate list and add values that are needed for this + // suspend. Note: the offset into the frame is not yet finalized. + addValuesToFrameTable(Suspend, Struct.CandidateSpills, Struct.CandidateAllocas); + + // Initialize the fields with pre-existing offsets. + initFrameStructLayout(Suspend, Struct); + + // Compute greedy struct layout of alloca. This places alloca first in the + // frame struct, before non-alloca values. + computeFrameStructLayoutGreedy(Suspend, Struct, /*IsAlloca*/ true); + + // Next, compute greedy struct layout of non-alloca. + computeFrameStructLayoutGreedy(Suspend, Struct, /*IsAlloca*/ false); + + // Sorting fields by offset and determine the total frame size required. + finalizeFrameStructLayout(Struct); + + LLVM_DEBUG({ + dbgs() << "\tFrame Size Bytes: " << Struct.Size << "\n"; + dbgs() << "\tFrame Align Bytes: " << Struct.Alignment.value() << "\n"; + }); + } + + // Create the Shape.FrameTy, the maximum of the frame sizes computed above + Shape.FrameTy = createFrameTy(); + + // CoroSplit will replace any uses of CoroBegin with an alloca (or similar). + // So where we need the frame ptr we just use CoroBegin. + Shape.FramePtr = Shape.CoroBegin; + + // IsFrameInlineInStorage determines if split coroutines will malloc a new + // frame. Typically this is done because the default frame provided by + // coro.id is not large enough. That would be done with this logic: + Shape.RetconLowering.IsFrameInlineInStorage = (MaxFrameSize <= RetconSize && MaxFrameAlign <= RetconAlign); + // However, we may elict to never use the inline storage to avoid the special + // cases it requires. + + // ======== Poison instructions ======== + + // Record instructions with poison so we can ignore them later when checking + // for incorrectly generated instructions. 
+#ifndef NDEBUG + SmallSet<Instruction *, 16> PoisonInstructions; + collectInstWithPoison(F, PoisonInstructions); +#endif + + // ======== Insert Reloads ======== + + LLVM_DEBUG(dbgs() << "Inserting Reloads\n"); + + // Insert reloads before spills because inserting reloads loops over uses. + // Spills (inserted below) also count as a use so if we insert spills + // before reloads then that would add more uses, but we should not insert + // a reload before a spill. So we insert reloads first. + insertReloads(); + + // ======== Insert Spills ======== + // Spills are done after reloads so we can try to insert spills after + // last-uses (reloads) when eviction is enabled. + + LLVM_DEBUG(dbgs() << "Inserting Spills\n"); + + insertSpills(Shape, DT); + + // ======== Complete Accesses To the Frame Structs ======== + + LLVM_DEBUG(dbgs() << "Building Phi Node Networks\n"); + + // With all spills and reloads in-place now we can generate the phi network + // that carries the values between defs and uses. + buildPhiNetwork(); + + LLVM_DEBUG(dbgs() << "Removing unused reloads\n"); + + // A value may cross multiple suspends but not be used between the suspends. + // Now that the phi node networks have been built we can remove reloads that + // did not end up having any uses. + removeUnusedReloads(); + + LLVM_DEBUG(dbgs() << "Creating GEPs\n"); + + // Build GEPs to complete the access to the frame structs. Replace poisoned + // frame address ptrs with computed values. Also replace allocas with frame + // address ptrs. 
+ createFrameGEPs(DeadInstructions); + + LLVM_DEBUG(dbgs() << "Final Frame Size Bytes: " << MaxFrameSize << "\n"); + LLVM_DEBUG(dbgs() << "Final Frame Align Bytes: " << MaxFrameAlign.value() << "\n"); + + LLVM_DEBUG(reportContStateInfo()); + + LLVM_DEBUG({ + dbgs() << "-- FrameStructs --\n"; + unsigned Idx = 0; + for (auto &I : FrameStructs) { + auto *Suspend = I.first; + auto &Struct = I.second; + dbgs() << "Suspend " << Idx++ << "\n"; + dbgs() << "\tSuspend: "; + Suspend->dump(); + Struct.dump(FrameTable); + } + }); + + LLVM_DEBUG({ + dbgs() << "-- FrameTable --\n"; + unsigned Idx = 0; + for (auto &Row : FrameTable) { + dbgs() << "Row " << Idx++ << "\n"; + Row.dump(); + } + }); + + // ======== Poison instructions ======== +#ifndef NDEBUG + // Verify no new poisons are left in the IR + if (hasNewPoisonOperand(F, PoisonInstructions)) { + llvm_unreachable("Error: Found poison"); + } +#endif + + // Remove dead instrs + for (auto *I : DeadInstructions) + I->eraseFromParent(); + + // Info is printed if non-debug mode for stats collection & reporting. + reportGepsSpillsAndReloads(F, NonFrameGeps, NonSpillStores, NonReloadLoads); + + DEBUG_DUMP_CFG(F, "post-frame-build-cfg"); + LLVM_DEBUG(dbgs() << "-- After buildCoroutineFrame, Before splitCoroutine --\n"; F.dump()); +} + +} // namespace + +ContStateBuilder::ContStateBuilder(Function &F, coro::Shape &S, std::function IsMaterializable) + : coro::AnyRetconABI(F, S, IsMaterializable) { +} + +// Allocate the coroutine frame and do spill/reload as needed. 
+void ContStateBuilder::buildCoroutineFrame(bool OptimizeFrame) { +#ifndef NDEBUG + if (UseLLVMContStateBuilder) { + AnyRetconABI::buildCoroutineFrame(OptimizeFrame); + return; + } +#endif + + ContStateBuilderImpl Impl(F, Shape, IsMaterializable); + + Impl.buildCoroutineFrame(); +} + +#undef DEBUG_TYPE diff --git a/llvmraytracing/lib/ContStateBuilder.h b/llvmraytracing/lib/ContStateBuilder.h new file mode 100644 index 0000000000..04bf05f62f --- /dev/null +++ b/llvmraytracing/lib/ContStateBuilder.h @@ -0,0 +1,45 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ + +//===- ContStateBuilder.h - A custom ABI for LLVM Coroutines---------------===// +// This file defines Continuations Passing Style Return-Continuation ABI for +// LLVM coroutine transforms that is used to build the cont state buffer. +//===----------------------------------------------------------------------===// + +#pragma once + +#include "llvm/Transforms/Coroutines/ABI.h" + +namespace llvmraytracing { + +class ContStateBuilder : public llvm::coro::AnyRetconABI { +public: + ContStateBuilder(llvm::Function &F, llvm::coro::Shape &S, std::function IsMaterializable); + + // Allocate the coroutine frame and do spill/reload as needed. + void buildCoroutineFrame(bool OptimizeFrame) override; +}; + +} // namespace llvmraytracing diff --git a/llvmraytracing/lib/Continuations.cpp b/llvmraytracing/lib/Continuations.cpp index 2116d4fcea..5348a8618f 100644 --- a/llvmraytracing/lib/Continuations.cpp +++ b/llvmraytracing/lib/Continuations.cpp @@ -30,6 +30,7 @@ //===----------------------------------------------------------------------===// #include "llvmraytracing/Continuations.h" +#include "ContStateBuilder.h" #include "compilerutils/CompilerUtils.h" #include "compilerutils/DxilToLlvm.h" #include "llvmraytracing/ContinuationsUtil.h" @@ -57,6 +58,7 @@ #include "llvm/Transforms/Coroutines/CoroEarly.h" #include "llvm/Transforms/Coroutines/CoroElide.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/SROA.h" @@ -580,6 +582,9 @@ void ContHelper::addDxilGpurtLibraryPasses(ModulePassManager &MPM) { FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); FPM.addPass(InstSimplifyPass()); FPM.addPass(SimplifyCFGPass()); + // Intentionally do another round of InstSimplify+SimplifyCFG to ensure 
traits in Gpurt are fully optimized out + FPM.addPass(InstSimplifyPass()); + FPM.addPass(SimplifyCFGPass()); FPM.addPass(ADCEPass()); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } @@ -852,7 +857,10 @@ Value *llvm::replaceIntrinsicCall(IRBuilder<> &B, Type *SystemDataTy, Value *Sys FakeUse = cast(B.CreateFreeze(Replacement)); } Inliner.inlineCall(*NewCall); - B.SetInsertPoint(&*B.GetInsertPoint()); + auto *OldInsertPt = &*B.GetInsertPoint(); + // If insert point happens to be `Call`, move it to the next instruction + B.SetInsertPoint(OldInsertPt == Call ? Call->getNextNode() : OldInsertPt); + Call->eraseFromParent(); // Inlined, so original replacement is now invalid Replacement = nullptr; @@ -875,23 +883,28 @@ static bool replaceEnqueueIntrinsic(Function &F) { B.SetInsertPoint(&CInst); CallInst *NewCall = nullptr; Value *WaitMask = nullptr; + Value *ShaderRecIdx = nullptr; Value *RetAddr = nullptr; if (IsWaitEnqueue) { // Handle WaitEnqueue. WaitMask = CInst.getArgOperand(1); - RetAddr = CInst.getArgOperand(2); + ShaderRecIdx = CInst.getArgOperand(2); + RetAddr = CInst.getArgOperand(3); } else { - RetAddr = CInst.getArgOperand(1); + ShaderRecIdx = CInst.getArgOperand(1); + RetAddr = CInst.getArgOperand(2); } SmallVector TailArgs; - TailArgs.append(CInst.arg_begin() + (WaitMask ? 3 : 2), CInst.arg_end()); + const uint32_t TailArgStartIdx = WaitMask ? 4 : 3; + TailArgs.append(CInst.arg_begin() + TailArgStartIdx, CInst.arg_end()); // For DX, these arguments are unused right now and are just here to fulfill the `JumpOp`s requirements as being // defined in the LgcCpsDialect. const uint32_t DummyLevelsArg = -1; Value *DummyCsp = PoisonValue::get(B.getInt32Ty()); - NewCall = B.create(CInst.getArgOperand(0), DummyLevelsArg, DummyCsp, RetAddr, TailArgs); + NewCall = + B.create(CInst.getArgOperand(0), DummyLevelsArg, DummyCsp, ShaderRecIdx, RetAddr, TailArgs); if (WaitMask) { // The only supported wait mask is a constant -1. 
We don't enforce having a constant here because the SPIR-V @@ -967,21 +980,15 @@ static void handleGetRtip(Function &Func, uint32_t RtipLevel) { && Func.getFunctionType()->getReturnType()->isIntegerTy(32)); auto *RtipConst = ConstantInt::get(IntegerType::get(Func.getContext(), 32), RtipLevel); - for (auto &Use : make_early_inc_range(Func.uses())) { - if (auto *CInst = dyn_cast(Use.getUser())) { - if (CInst->isCallee(&Use)) { - CInst->replaceAllUsesWith(RtipConst); - CInst->eraseFromParent(); - } - } - } + llvm::replaceCallsToFunction(Func, *RtipConst); } static void handleGetUninitialized(Function &Func) { auto *ArgTy = Func.getReturnType(); auto *Poison = PoisonValue::get(ArgTy); + IRBuilder<> B{Func.getContext()}; llvm::forEachCall(Func, [&](llvm::CallInst &CInst) { - IRBuilder<> B(&CInst); + B.SetInsertPoint(&CInst); // Create a frozen poison value so poison doesn't propagate into // dependent values, e.g. when bitpacking the uninitialized value into // a bitfield that should not be invalidated. @@ -1163,14 +1170,22 @@ uint64_t llvm::computePayloadSpillSize(uint64_t NumI32s, uint64_t NumReservedReg return NumStackI32s * RegisterBytes; } -namespace llvm { -namespace coro { -bool defaultMaterializable(Instruction &V); -} // End namespace coro -} // End namespace llvm +DXILCoroSplitPass::DXILCoroSplitPass() + : CoroSplitPass(std::function(&DXILMaterializable), {[](Function &F, coro::Shape &S) { + return std::make_unique(F, S, DXILMaterializable); + }}, + /*OptimizeFrame*/ true) { +} + +LgcCoroSplitPass::LgcCoroSplitPass() + : CoroSplitPass(std::function(&LgcMaterializable), {[](Function &F, coro::Shape &S) { + return std::make_unique(F, S, LgcMaterializable); + }}, + /*OptimizeFrame*/ true) { +} bool llvm::commonMaterializable(Instruction &Inst) { - if (coro::defaultMaterializable(Inst)) + if (coro::isTriviallyMaterializable(Inst)) return true; // Insert into constant. 
@@ -1237,7 +1252,7 @@ bool llvm::LgcMaterializable(Instruction &OrigI) { // FIXME: switch to dialectOp check. if (CalledName.starts_with("lgc.user.data") || CalledName.starts_with("lgc.shader.input") || CalledName.starts_with("lgc.create.get.desc.ptr") || CalledName.starts_with("lgc.load.buffer.desc") || - CalledName.starts_with("lgc.load.user.data")) + CalledName.starts_with("lgc.load.strided.buffer.desc") || CalledName.starts_with("lgc.load.user.data")) return true; } } @@ -1313,6 +1328,8 @@ void addLgcContinuationTransform(ModulePassManager &MPM) { MPM.addPass(LowerAwaitPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(InstCombinePass())); + MPM.addPass(CoroEarlyPass()); CGSCCPassManager CGPM; CGPM.addPass(LgcCoroSplitPass()); @@ -1325,6 +1342,7 @@ void addLgcContinuationTransform(ModulePassManager &MPM) { #ifndef NDEBUG MPM.addPass(ContinuationsLintPass()); #endif + MPM.addPass(ContinuationsStatsReportPass()); MPM.addPass(createModuleToFunctionPassAdaptor(LowerSwitchPass())); MPM.addPass(createModuleToFunctionPassAdaptor(FixIrreduciblePass())); diff --git a/llvmraytracing/lib/ContinuationsLint.cpp b/llvmraytracing/lib/ContinuationsLint.cpp index cce8230015..55852e2aa1 100644 --- a/llvmraytracing/lib/ContinuationsLint.cpp +++ b/llvmraytracing/lib/ContinuationsLint.cpp @@ -31,6 +31,7 @@ #include "llvmraytracing/Continuations.h" #include "lgc/LgcCpsDialect.h" +#include "lgc/LgcIlCpsDialect.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvm/ADT/SmallSet.h" #include "llvm/IR/Analysis.h" @@ -147,21 +148,37 @@ void ContinuationsLintPassImpl::checkJumpTargets() { // Check that every function has at most one setLocalRootIndex call. 
void ContinuationsLintPassImpl::checkSetLocalRootIndex() { - if (auto *SetF = Mod.getFunction("amd.dx.setLocalRootIndex")) { + struct VisitorState { + const AwaitFuncSetTy &FuncsWithAwaits; SmallDenseSet HasSetF; + SmallVector InvalidFuncs; + }; - llvm::forEachCall(*SetF, [&](CallInst &CInst) { - // Returns true if it is a new value - Function *Func = CInst.getFunction(); - // It is allowed to have multiple setLocalRootIndex calls if the call resides in a function that was not yet - // split. - if (FuncsWithAwaits.contains(Func)) - return; - - auto Inserted = HasSetF.insert(Func); - Check(Inserted.second, "Found a function with more than one call to setLocalRootIndex", Func); - }); - } + static const auto Visitor = + llvm_dialects::VisitorBuilder() + .add([](VisitorState &S, lgc::ilcps::SetLocalRootIndexOp &Op) { + // Collect all functions that have more than one call to lgc.ilcps.setLocalRootIndex, but only if these + // calls do not reside in functions that are not yet split. + + // Returns true if it is a new value + Function *Func = Op.getFunction(); + // It is allowed to have multiple setLocalRootIndex calls if the call resides in a function that was not yet + // split. 
+ if (S.FuncsWithAwaits.contains(Func)) + return; + + auto Inserted = S.HasSetF.insert(Func); + if (!Inserted.second) + S.InvalidFuncs.push_back(Func); + }) + .build(); + + VisitorState State{FuncsWithAwaits, {}, {}}; + + Visitor.visit(State, Mod); + + for (auto *Func : State.InvalidFuncs) + checkFailed("Found a function with more than one call to setLocalRootIndex", Func); } PreservedAnalyses ContinuationsLintPass::run(Module &Mod, ModuleAnalysisManager &AnalysisManager) { diff --git a/llvmraytracing/lib/ContinuationsStatsReport.cpp b/llvmraytracing/lib/ContinuationsStatsReport.cpp index 5c02a19c67..2f046b8c13 100644 --- a/llvmraytracing/lib/ContinuationsStatsReport.cpp +++ b/llvmraytracing/lib/ContinuationsStatsReport.cpp @@ -126,7 +126,6 @@ void ContinuationsStatsReportPassImpl::collectProcessableFunctions() { if (!Stage || Stage == RayTracingShaderStage::KernelEntry) continue; - const uint32_t SystemDataArgumentIndex = lgc::cps::isCpsFunction(F) ? CpsArgIdx::SystemData : 2; switch (Stage.value()) { case RayTracingShaderStage::RayGeneration: case RayTracingShaderStage::Intersection: @@ -137,7 +136,7 @@ void ContinuationsStatsReportPassImpl::collectProcessableFunctions() { case RayTracingShaderStage::Traversal: { FunctionData Data; Data.Stage = Stage; - Data.SystemDataTy = F.getFunctionType()->getParamType(SystemDataArgumentIndex); + Data.SystemDataTy = F.getFunctionType()->getParamType(CpsArgIdxWithStackPtr::SystemData); assert(Data.SystemDataTy->isStructTy() && "SystemData should be of struct type!"); [[maybe_unused]] bool DidInsert = ToProcess.insert({&F, std::move(Data)}).second; diff --git a/llvmraytracing/lib/CpsStackLowering.cpp b/llvmraytracing/lib/CpsStackLowering.cpp index d189770d7b..caec0d870d 100644 --- a/llvmraytracing/lib/CpsStackLowering.cpp +++ b/llvmraytracing/lib/CpsStackLowering.cpp @@ -32,7 +32,6 @@ #include "lgc/LgcRtDialect.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvm/Analysis/InstructionSimplify.h" -#include 
"llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" @@ -107,7 +106,7 @@ void CpsStackLowering::visitGetElementPtr(GetElementPtrInst &GEP) { if (GEP.getAddressSpace() != lgc::cps::stackAddrSpace) return; - IRBuilder<> Builder(&GEP); + Builder.SetInsertPoint(&GEP); auto Values = TypeLower.getValue(GEP.getPointerOperand()); Value *AddChain = Values[0]; @@ -151,10 +150,10 @@ void CpsStackLowering::visitLoad(LoadInst &Load) { auto Values = TypeLower.getValue(Load.getPointerOperand()); - IRBuilder<> Builder(&Load); - Values[0] = getRealMemoryAddress(Builder, Values[0]); + Builder.SetInsertPoint(&Load); + Values[0] = getRealMemoryAddress(Values[0]); - Values[0] = Builder.CreateBitCast(Values[0], Load.getType()->getPointerTo(getLoweredCpsStackAddrSpace())); + Values[0] = Builder.CreateBitCast(Values[0], Builder.getPtrTy(getLoweredCpsStackAddrSpace())); Load.replaceUsesOfWith(Load.getPointerOperand(), Values[0]); } @@ -169,11 +168,10 @@ void CpsStackLowering::visitStore(llvm::StoreInst &Store) { auto Values = TypeLower.getValue(Store.getPointerOperand()); - IRBuilder<> Builder(&Store); - Values[0] = getRealMemoryAddress(Builder, Values[0]); + Builder.SetInsertPoint(&Store); + Values[0] = getRealMemoryAddress(Values[0]); - Values[0] = - Builder.CreateBitCast(Values[0], Store.getValueOperand()->getType()->getPointerTo(getLoweredCpsStackAddrSpace())); + Values[0] = Builder.CreateBitCast(Values[0], Builder.getPtrTy(getLoweredCpsStackAddrSpace())); Store.replaceUsesOfWith(Store.getPointerOperand(), Values[0]); } @@ -184,7 +182,7 @@ void CpsStackLowering::visitStore(llvm::StoreInst &Store) { // @param JumpOp: the instruction void CpsStackLowering::visitJump(lgc::cps::JumpOp &JumpOp) { Builder.SetInsertPoint(&JumpOp); - Value *CSP = loadCsp(Builder); + Value *CSP = loadCsp(); // Update previously lowered arguments SmallVector TailArgs{JumpOp.getTail()}; @@ -205,8 +203,8 @@ void CpsStackLowering::visitJump(lgc::cps::JumpOp &JumpOp) { // // @param 
Continue: the instruction void CpsStackLowering::visitContinue(lgc::ilcps::ContinueOp &Continue) { - IRBuilder<> Builder(&Continue); - Continue.setCsp(loadCsp(Builder)); + Builder.SetInsertPoint(&Continue); + Continue.setCsp(loadCsp()); } // ===================================================================================================================== @@ -214,8 +212,8 @@ void CpsStackLowering::visitContinue(lgc::ilcps::ContinueOp &Continue) { // // @param WaitContinue: the instruction void CpsStackLowering::visitWaitContinue(lgc::ilcps::WaitContinueOp &WaitContinue) { - IRBuilder<> Builder(&WaitContinue); - WaitContinue.setCsp(loadCsp(Builder)); + Builder.SetInsertPoint(&WaitContinue); + WaitContinue.setCsp(loadCsp()); } // ===================================================================================================================== @@ -264,14 +262,14 @@ void CpsStackLowering::visitBitCastInst(llvm::BitCastInst &BC) { // // @param AllocOp: the instruction void CpsStackLowering::visitCpsAlloc(lgc::cps::AllocOp &AllocOp) { - IRBuilder<> Builder(&AllocOp); + Builder.SetInsertPoint(&AllocOp); Value *Size = AllocOp.getSize(); if (Instruction *Inst = dyn_cast(Size)) if (auto *NewSize = llvm::simplifyInstruction(Inst, *SQ)) Size = NewSize; - Value *CSP = loadCsp(Builder); + Value *CSP = loadCsp(); // align Size to ContinuationStackAlignment ConstantInt *Const = cast(Size); @@ -295,14 +293,14 @@ void CpsStackLowering::visitCpsAlloc(lgc::cps::AllocOp &AllocOp) { // // @param FreeOp: the instruction void CpsStackLowering::visitCpsFree(lgc::cps::FreeOp &FreeOp) { - IRBuilder<> Builder(&FreeOp); + Builder.SetInsertPoint(&FreeOp); Value *Size = FreeOp.getSize(); if (Instruction *Inst = dyn_cast(Size)) if (auto *NewSize = llvm::simplifyInstruction(Inst, *SQ)) Size = NewSize; - Value *CSP = loadCsp(Builder); + Value *CSP = loadCsp(); // align Size to ContinuationStackAlignment and subtract from CSP ConstantInt *Const = cast(Size); @@ -324,9 +322,9 @@ void 
CpsStackLowering::visitCpsFree(lgc::cps::FreeOp &FreeOp) { // // @param PeekOp: the instruction void CpsStackLowering::visitCpsPeek(lgc::cps::PeekOp &PeekOp) { - IRBuilder<> Builder(&PeekOp); + Builder.SetInsertPoint(&PeekOp); - auto *Ptr = loadCsp(Builder); + auto *Ptr = loadCsp(); auto *Size = PeekOp.getSize(); int ImmSize = cast(Size)->getSExtValue(); @@ -346,10 +344,10 @@ void CpsStackLowering::visitCpsPeek(lgc::cps::PeekOp &PeekOp) { void CpsStackLowering::visitSetVsp(lgc::cps::SetVspOp &SetVsp) { auto *Ptr = SetVsp.getPtr(); - IRBuilder<> B(&SetVsp); + Builder.SetInsertPoint(&SetVsp); auto Values = TypeLower.getValue(Ptr); - B.CreateStore(Values[0], CpsStackAlloca); + Builder.CreateStore(Values[0], CpsStackAlloca); TypeLower.replaceInstruction(&SetVsp, {}); } @@ -358,8 +356,8 @@ void CpsStackLowering::visitSetVsp(lgc::cps::SetVspOp &SetVsp) { // // @param GetVsp: the instruction void CpsStackLowering::visitGetVsp(lgc::cps::GetVspOp &GetVsp) { - IRBuilder<> B(&GetVsp); - TypeLower.replaceInstruction(&GetVsp, {loadCsp(B)}); + Builder.SetInsertPoint(&GetVsp); + TypeLower.replaceInstruction(&GetVsp, {loadCsp()}); } // ===================================================================================================================== @@ -371,7 +369,7 @@ void CpsStackLowering::visitGetVsp(lgc::cps::GetVspOp &GetVsp) { // @param Offset: The offset to the base address, given as integer with bitwidth // <= 32. // -Value *CpsStackLowering::getRealMemoryAddress(IRBuilder<> &Builder, Value *Offset) { +Value *CpsStackLowering::getRealMemoryAddress(Value *Offset) { // Since we are using at most 32-bit offsets, assert that we don't put in any // offset larger 32 bit. 
assert(Offset->getType()->isIntegerTy() && Offset->getType()->getIntegerBitWidth() <= 32); @@ -382,13 +380,12 @@ Value *CpsStackLowering::getRealMemoryAddress(IRBuilder<> &Builder, Value *Offse Value *GepBase = BasePointer; Value *GepIndex = Offset; - Type *I8 = Builder.getInt8Ty(); if (isa(BasePointer)) { - GepBase = Builder.CreateIntToPtr(Offset, I8->getPointerTo(getLoweredCpsStackAddrSpace())); + GepBase = Builder.CreateIntToPtr(Offset, Builder.getPtrTy(getLoweredCpsStackAddrSpace())); GepIndex = Builder.getInt32(0); } - return Builder.CreateGEP(I8, GepBase, {GepIndex}); + return Builder.CreateGEP(Builder.getInt8Ty(), GepBase, {GepIndex}); } // ===================================================================================================================== @@ -402,7 +399,6 @@ Function *CpsStackLowering::addOrInitCsp(Function *F, Function *GetGlobalMemBase CompilerUtils::CrossModuleInliner CrossInliner; auto &GpurtContext = lgc::GpurtContext::get(Mod->getContext()); auto &GpurtLibrary = GpurtContext.theModule ? *GpurtContext.theModule : *Mod; - IRBuilder<> Builder(F->getContext()); Value *Initializer = nullptr; Builder.SetInsertPointPastAllocas(F); @@ -448,13 +444,13 @@ Function *CpsStackLowering::addOrInitCsp(Function *F, Function *GetGlobalMemBase // Get the global memory base address. 
if (GetGlobalMemBase) { auto *Base = CrossInliner.inlineCall(Builder, GetGlobalMemBase).returnValue; - auto *CspTy = Builder.getInt8Ty()->getPointerTo(getLoweredCpsStackAddrSpace()); + auto *CspTy = Builder.getPtrTy(getLoweredCpsStackAddrSpace()); setRealBasePointer(Builder.CreateIntToPtr(Base, CspTy)); } return F; } -Value *CpsStackLowering::loadCsp(IRBuilder<> &Builder) { +Value *CpsStackLowering::loadCsp() { return Builder.CreateLoad(CpsStackAlloca->getAllocatedType(), CpsStackAlloca); } diff --git a/llvmraytracing/lib/DXILContLgcRtOpConverter.cpp b/llvmraytracing/lib/DXILContLgcRtOpConverter.cpp index 71bcb22e9b..5b3d5ad9c9 100644 --- a/llvmraytracing/lib/DXILContLgcRtOpConverter.cpp +++ b/llvmraytracing/lib/DXILContLgcRtOpConverter.cpp @@ -31,6 +31,7 @@ #include "llvmraytracing/Continuations.h" #include "llvmraytracing/ContinuationsUtil.h" +#include "lgc/LgcIlCpsDialect.h" #include "lgc/LgcRtDialect.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -412,8 +413,7 @@ bool DXILContLgcRtOpConverterPass::convertDxOp(Function &Func) { void DXILContLgcRtOpConverterPass::setupLocalRootIndex(Function *F) { Builder->SetInsertPointPastAllocas(F); auto *LocalIndex = Builder->create(); - auto *SetLocalRootIndex = llvm::getSetLocalRootIndex(*F->getParent()); - Builder->CreateCall(SetLocalRootIndex, LocalIndex); + Builder->create(LocalIndex); } // Do preparation transformations to entry-point shaders. 
diff --git a/llvmraytracing/lib/DXILContPostProcess.cpp b/llvmraytracing/lib/DXILContPostProcess.cpp index 2320f2e32b..5409bbe07a 100644 --- a/llvmraytracing/lib/DXILContPostProcess.cpp +++ b/llvmraytracing/lib/DXILContPostProcess.cpp @@ -23,37 +23,28 @@ * **********************************************************************************************************************/ -//===- DXILContPostProcess.cpp - Replace intrinsic calls ------------------===// +//===- DXILContPostProcess.cpp - Finalize IR ------------------===// // -// * Insert the initialization of the continuation stack pointer. -// * Replace dx.op intrinsic calls with calls to the driver implementation -// and initialize the system data. -// * Wraps all uses of function pointers into an intrinsic that adds -// metadata (e.g. VGPR counts) to the function pointer. +// * Unpack 32-bit to 64-bit jump addresses +// * Translate lgc.cps.jumps to lgc.ilcps.continue / waitContinue calls +// * Cleanup unused metadata // //===----------------------------------------------------------------------===// -#include "compilerutils/CompilerUtils.h" -#include "llpc/GpurtEnums.h" #include "llvmraytracing/Continuations.h" #include "llvmraytracing/ContinuationsUtil.h" -#include "llvmraytracing/CpsStackLowering.h" #include "llvmraytracing/GpurtContext.h" #include "lgc/LgcCpsDialect.h" #include "lgc/LgcIlCpsDialect.h" #include "lgc/LgcRtDialect.h" #include "llvm-dialects/Dialect/Builder.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm-dialects/Dialect/Visitor.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include @@ -65,211 +56,29 @@ using namespace llvm; 
namespace { class DXILContPostProcessPassImpl final { public: - DXILContPostProcessPassImpl(Module &M, Module &GpurtLibrary); + DXILContPostProcessPassImpl(Module &M); PreservedAnalyses run(ModuleAnalysisManager &AnalysisManager); - static constexpr unsigned SystemDataArgumentIndex = 2; - struct FunctionData { - DXILShaderKind Kind = DXILShaderKind::Invalid; - /// Calls to hlsl intrinsics - SmallVector IntrinsicCalls; - - /// If this is the start function part of a split function - bool IsStart = true; - Type *SystemDataTy = nullptr; - unsigned SystemDataArgumentIndex = std::numeric_limits::max(); - }; - private: - void initializeProcessableFunctionData(); - bool lowerCpsOps(); Value *ensure64BitAddr(Value *Packed32BitAddr); void lowerJumpOp(lgc::cps::JumpOp &JumpOp); - void lowerAsContinuationReferenceOp(lgc::cps::AsContinuationReferenceOp &AsCrOp); + void lowerAsContinuationReferenceOp(lgc::cps::AsContinuationReferenceOp &AsCrOp, Function *GetContinuationAddrAndMD); bool cleanupIncomingPayloadMetadata(Function &F); - bool cleanupOutgoingPayloadMetadata(); Module *Mod; - Module *GpurtLibrary; - MapVector ToProcess; llvm_dialects::Builder Builder; - std::optional StackAddrspace; - CompilerUtils::CrossModuleInliner CrossInliner; }; -// Removes outgoing payload metadata -bool DXILContPostProcessPassImpl::cleanupOutgoingPayloadMetadata() { - struct State { - bool Changed = false; - }; - - static const auto Visitor = - llvm_dialects::VisitorBuilder() - .addSet([](State &State, Instruction &Op) { - ContHelper::OutgoingRegisterCount::reset(&Op); - ContHelper::ReturnedRegisterCount::reset(&Op); - State.Changed = true; - }) - .build(); - - State S; - Visitor.visit(S, *Mod); - - return S.Changed; -} - -static Function *getContinuationGetAddrAndMD(Module &M, Type *RetTy) { +static Function *getContinuationGetAddrAndMD(Module &M) { auto *Name = "continuation.getAddrAndMD"; if (auto *F = M.getFunction(Name)) return F; auto &C = M.getContext(); - auto *FuncTy = 
FunctionType::get(RetTy, {PointerType::get(C, 0)}, false); + auto *FuncTy = FunctionType::get(Type::getInt32Ty(M.getContext()), {PointerType::get(C, 0)}, false); return cast(M.getOrInsertFunction(Name, FuncTy).getCallee()); } -/// Checks some properties guaranteed for a module containing continuations -/// as expected by the backend. -[[maybe_unused]] static void checkContinuationsModule(const Module &M) { - // Check that resume functions do not have a stack size set. - for (auto &Func : M) { - if (auto *MD = dyn_cast_or_null(Func.getMetadata(ContHelper::MDContinuationName))) { - auto *StartFunc = extractFunctionOrNull(MD->getOperand(0)); - bool IsStart = (&Func == StartFunc); - bool HasStackSizeMetadata = ContHelper::StackSize::tryGetValue(&Func).has_value(); - if (!IsStart && HasStackSizeMetadata) - report_fatal_error("Found resume function with stack size metadata!"); - } - } -} - -void DXILContPostProcessPassImpl::initializeProcessableFunctionData() { - for (Function &F : *Mod) { - if (F.isDeclaration()) - continue; - - auto Stage = lgc::rt::getLgcRtShaderStage(&F); - if (!Stage) - continue; - - // For the kernel entry function in GPURT, we only care about its existence - // in @ToProcess, since we only want to create an alloca for the - // continuation stack pointer later (and do the lgc.cps lowering). - if (Stage == lgc::rt::RayTracingShaderStage::KernelEntry) { - FunctionData Data; - Data.Kind = DXILShaderKind::Compute; - [[maybe_unused]] bool DidInsert = ToProcess.insert({&F, std::move(Data)}).second; - assert(DidInsert); - continue; - } - - // Handle start functions first - if (!llvm::isStartFunc(&F)) - continue; - - DXILShaderKind Kind = ShaderStageHelper::rtShaderStageToDxilShaderKind(*Stage); - const bool IsCpsFunction = lgc::cps::isCpsFunction(F); - - switch (Kind) { - case DXILShaderKind::RayGeneration: { - FunctionData Data; - Data.Kind = Kind; - - Data.SystemDataArgumentIndex = !IsCpsFunction ? 
SystemDataArgumentIndex : CpsArgIdxWithStackPtr::SystemData; - - Data.SystemDataTy = F.getFunctionType()->getParamType(Data.SystemDataArgumentIndex); - - [[maybe_unused]] bool DidInsert = ToProcess.insert({&F, std::move(Data)}).second; - assert(DidInsert); - break; - } - case DXILShaderKind::Intersection: - case DXILShaderKind::AnyHit: - case DXILShaderKind::ClosestHit: - case DXILShaderKind::Miss: - case DXILShaderKind::Callable: { - FunctionData Data; - Data.Kind = Kind; - - Data.SystemDataArgumentIndex = !IsCpsFunction ? SystemDataArgumentIndex : CpsArgIdxWithStackPtr::SystemData; - Data.SystemDataTy = F.getFunctionType()->getParamType(Data.SystemDataArgumentIndex); - [[maybe_unused]] bool DidInsert = ToProcess.insert({&F, std::move(Data)}).second; - assert(DidInsert); - break; - } - default: - break; - } - } - - // Also find continuation parts of the functions - for (auto &F : *Mod) { - if (F.isDeclaration()) - continue; - if (auto *MD = dyn_cast_or_null(F.getMetadata(ContHelper::MDContinuationName))) { - auto *EntryF = extractFunctionOrNull(MD->getOperand(0)); - auto Stage = lgc::rt::getLgcRtShaderStage(EntryF); - if (Stage && &F != EntryF) { - FunctionData Data = ToProcess[EntryF]; - Data.IsStart = false; - - Data.SystemDataArgumentIndex = - !lgc::cps::isCpsFunction(F) ? SystemDataArgumentIndex : CpsArgIdxWithStackPtr::SystemData; - - Data.SystemDataTy = F.getArg(Data.SystemDataArgumentIndex)->getType(); - [[maybe_unused]] bool DidInsert = ToProcess.insert({&F, std::move(Data)}).second; - assert(DidInsert); - } - } - } -} - -// -// Entry point for all lgc.cps lowering. -// -bool DXILContPostProcessPassImpl::lowerCpsOps() { - bool Changed = false; - - struct CpsVisitorState { - DXILContPostProcessPassImpl &Self; - bool &Changed; - llvm_dialects::Builder &Builder; - }; - - // Note: It is a bit unlucky that we are using both a visitor for - // lgc.cps.as.continuation.reference and lgc.cps.jump and a loop for the - // actual stack lowering. 
It would be nice to use a visitor for both of them, - // but currently, there seems to be no support in dialects for marrying both - // approaches: we would need a visitor that supports visiting function - // definitions as well. - static const auto CpsVisitor = llvm_dialects::VisitorBuilder() - .add( - [](CpsVisitorState &State, lgc::cps::AsContinuationReferenceOp &AsCrOp) { - State.Self.lowerAsContinuationReferenceOp(AsCrOp); - State.Changed = true; - }) - .add([](CpsVisitorState &State, lgc::cps::JumpOp &JumpOp) { - State.Self.lowerJumpOp(JumpOp); - State.Changed = true; - }) - .build(); - - CpsVisitorState State{*this, Changed, Builder}; - - for (Function &Func : *Mod) { - if (Func.isDeclaration()) - continue; - - if (lgc::rt::getLgcRtShaderStage(&Func) == lgc::rt::RayTracingShaderStage::KernelEntry || - Func.hasMetadata(ContHelper::MDContinuationName) || lgc::cps::isCpsFunction(Func)) { - // Lower lgc.cps.jump and lgc.cps.as.continuation.reference ops. - CpsVisitor.visit(State, Func); - } - } - - return Changed; -} - Value *DXILContPostProcessPassImpl::ensure64BitAddr(Value *Src) { Type *SrcTy = Src->getType(); Type *I64 = Builder.getInt64Ty(); @@ -299,55 +108,91 @@ void DXILContPostProcessPassImpl::lowerJumpOp(lgc::cps::JumpOp &JumpOp) { SmallVector TailArgs{JumpOp.getTail()}; Value *JumpTarget = ensure64BitAddr(JumpOp.getTarget()); + Value *ShaderIndex = JumpOp.getShaderIndex(); Value *RetAddr = JumpOp.getRcr(); if (ContHelper::isWaitAwaitCall(JumpOp)) { - ContinueOp = Builder.create(JumpTarget, Builder.getInt64(-1), JumpOp.getCsp(), RetAddr, - TailArgs); + ContinueOp = Builder.create(JumpTarget, Builder.getInt64(-1), JumpOp.getCsp(), + ShaderIndex, RetAddr, TailArgs); ContHelper::removeWaitMask(JumpOp); } else { - ContinueOp = Builder.create(JumpTarget, JumpOp.getCsp(), RetAddr, TailArgs); + ContinueOp = Builder.create(JumpTarget, JumpOp.getCsp(), ShaderIndex, RetAddr, TailArgs); } ContinueOp->copyMetadata(JumpOp); JumpOp.eraseFromParent(); + + 
ContHelper::OutgoingRegisterCount::reset(ContinueOp); + ContHelper::ReturnedRegisterCount::reset(ContinueOp); } -void DXILContPostProcessPassImpl::lowerAsContinuationReferenceOp(lgc::cps::AsContinuationReferenceOp &AsCrOp) { +void DXILContPostProcessPassImpl::lowerAsContinuationReferenceOp(lgc::cps::AsContinuationReferenceOp &AsCrOp, + Function *GetContinuationAddrAndMD) { Builder.SetInsertPoint(&AsCrOp); - Value *AddrWithMD = Builder.CreateCall(getContinuationGetAddrAndMD(*Mod, AsCrOp.getType()), {AsCrOp.getFn()}); + Value *AddrWithMD = Builder.CreateCall(GetContinuationAddrAndMD, {AsCrOp.getFn()}); AsCrOp.replaceAllUsesWith(AddrWithMD); AsCrOp.eraseFromParent(); } -DXILContPostProcessPassImpl::DXILContPostProcessPassImpl(Module &M, Module &GpurtLibrary) - : Mod{&M}, GpurtLibrary{&GpurtLibrary}, Builder{Mod->getContext()}, StackAddrspace{ - ContHelper::tryGetStackAddrspace(*Mod)} { +DXILContPostProcessPassImpl::DXILContPostProcessPassImpl(Module &M) : Mod{&M}, Builder{Mod->getContext()} { } PreservedAnalyses DXILContPostProcessPassImpl::run(ModuleAnalysisManager &AnalysisManager) { - bool Changed = false; + struct ProcessingState { + DXILContPostProcessPassImpl &Self; + bool Changed; + Function *GetContinuationAddrAndMD; - initializeProcessableFunctionData(); + llvm::PreservedAnalyses getPreservedAnalyses() { + if (Changed) + return PreservedAnalyses::none(); - for (auto &[Func, Data] : ToProcess) { - ContHelper::IncomingRegisterCount::reset(Func); - ContHelper::ContinuationStateByteCount::reset(Func); - } + return PreservedAnalyses::all(); + } + }; + + ProcessingState State{*this, false, getContinuationGetAddrAndMD(*Mod)}; + + static const auto CpsVisitor = + llvm_dialects::VisitorBuilder() + .add( + [](ProcessingState &State, lgc::cps::AsContinuationReferenceOp &AsCrOp) { + State.Self.lowerAsContinuationReferenceOp(AsCrOp, State.GetContinuationAddrAndMD); + State.Changed = true; + }) + .add([](ProcessingState &State, lgc::cps::JumpOp &JumpOp) { + 
State.Self.lowerJumpOp(JumpOp); + State.Changed = true; + }) + .build(); + + for (Function &F : *Mod) { + if (F.isDeclaration()) + continue; + + auto Stage = lgc::rt::getLgcRtShaderStage(&F); + if (!Stage) + continue; - Changed |= lowerCpsOps(); + if (Stage == lgc::rt::RayTracingShaderStage::KernelEntry || F.hasMetadata(ContHelper::MDContinuationName) || + lgc::cps::isCpsFunction(F)) { + // Lower lgc.cps.jump and lgc.cps.as.continuation.reference ops. + CpsVisitor.visit(State, F); + } - Changed |= fixupDxilMetadata(*Mod); - Changed |= cleanupOutgoingPayloadMetadata(); + if (Stage == lgc::rt::RayTracingShaderStage::Traversal) + continue; + + ContHelper::IncomingRegisterCount::reset(&F); + ContHelper::ContinuationStateByteCount::reset(&F); + } -#ifndef NDEBUG - checkContinuationsModule(*Mod); -#endif + State.Changed |= fixupDxilMetadata(*Mod); - Changed |= llvm::removeUnusedFunctionDecls(Mod, false); + State.Changed |= llvm::removeUnusedFunctionDecls(Mod, false); - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + return State.getPreservedAnalyses(); } } // anonymous namespace @@ -356,7 +201,6 @@ llvm::PreservedAnalyses DXILContPostProcessPass::run(llvm::Module &Module, LLVM_DEBUG(dbgs() << "Run the pass dxil-cont-post-process\n"); AnalysisManager.getResult(Module); - auto &GpurtContext = lgc::GpurtContext::get(Module.getContext()); - DXILContPostProcessPassImpl Impl{Module, GpurtContext.theModule ? 
*GpurtContext.theModule : Module}; + DXILContPostProcessPassImpl Impl{Module}; return Impl.run(AnalysisManager); } diff --git a/llvmraytracing/lib/DXILContPrepareGpurtLibrary.cpp b/llvmraytracing/lib/DXILContPrepareGpurtLibrary.cpp index 69a4a8f823..8f3aad0a32 100644 --- a/llvmraytracing/lib/DXILContPrepareGpurtLibrary.cpp +++ b/llvmraytracing/lib/DXILContPrepareGpurtLibrary.cpp @@ -36,6 +36,7 @@ #include "compilerutils/DxilUtils.h" #include "llvmraytracing/Continuations.h" #include "llvmraytracing/ContinuationsUtil.h" +#include "lgc/LgcIlCpsDialect.h" #include "lgc/LgcRtDialect.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" @@ -133,7 +134,6 @@ static bool isUtilFunction(StringRef Name) { "GetCurrentFuncAddr", "GetFuncAddr", "GetI32", - "GetLocalRootIndex", "GetResumePointAddr", "GetRtip", "GetSetting", @@ -183,6 +183,20 @@ static void handleGetShaderRecordIndex(llvm_dialects::Builder &B, Function &Func }); } +/// Restore the local root index after calls to some function, Func. +/// Currently, Func is some Await intrinsic, and we iterate over all its call instructions to insert the call to +/// lgc.ilcps.setLocalRootIndex after it. This has the effect of not running into trouble with a mangled call to +/// lgc.ilcps.setLocalRootIndex after cross-module inlining and helps us with determining a basic block split point +/// later. We need that split point to ensure lgc.ilcps.setLocalRootIndex is called before resource accesses that depend +/// on the local root index occur. 
+static void restoreLocalRootIndex(llvm_dialects::Builder &B, Function &Func) { + llvm::forEachCall(Func, [&](CallInst &CInst) { + B.SetInsertPoint(++CInst.getIterator()); + auto *ShaderIndexCall = B.create(); + B.create(ShaderIndexCall); + }); +} + llvm::PreservedAnalyses DXILContPrepareGpurtLibraryPass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AnalysisManager) { LLVM_DEBUG(dbgs() << "Run the dxil-cont-prepare-gpurt-library pass\n"); @@ -205,6 +219,8 @@ llvm::PreservedAnalyses DXILContPrepareGpurtLibraryPass::run(llvm::Module &M, } else if (Name.contains("_Amd")) { if (isUtilFunction(Name)) { ShouldTransform = true; + if (Name.contains("Await")) + restoreLocalRootIndex(B, *F); } else if (Name.contains("IsLlpc")) { ShouldTransform = false; handleIsLlpc(*F); diff --git a/llvmraytracing/lib/LgcCpsJumpInliner.cpp b/llvmraytracing/lib/LgcCpsJumpInliner.cpp index 4e10b3845f..c0afacbbf2 100644 --- a/llvmraytracing/lib/LgcCpsJumpInliner.cpp +++ b/llvmraytracing/lib/LgcCpsJumpInliner.cpp @@ -92,7 +92,7 @@ PreservedAnalyses LgcCpsJumpInlinerPassImpl::run() { assert(JumpTargetFunc && !JumpTargetFunc->isDeclaration()); Builder.SetInsertPoint(Jump); - SmallVector ArgList{Jump->getRcr()}; + SmallVector ArgList{Jump->getShaderIndex(), Jump->getRcr()}; ArgList.append(Jump->getTail().begin(), Jump->getTail().end()); diff --git a/llvmraytracing/lib/LowerAwait.cpp b/llvmraytracing/lib/LowerAwait.cpp index 5bdc56294a..3c6849a08b 100644 --- a/llvmraytracing/lib/LowerAwait.cpp +++ b/llvmraytracing/lib/LowerAwait.cpp @@ -72,21 +72,17 @@ void LowerAwaitPassImpl::processContinuations() { // If this is the first time we've done this for this function // Insert the required calls at the start of the function: // id = llvm.coro.id.retcon - // handle = llvm.coro.begin id + // + // handle = llvm.coro.begin.custom.abi id, ptr, i32 custom_index // Change the return type of the function to the await token // Replace the call with // co.flag = llvm.coro.suspend.retcon // unreachable auto 
&Context = Mod.getContext(); - auto *I8Ptr = Type::getInt8Ty(Context)->getPointerTo(); + auto *PtrTy = PointerType::get(Context, 0); auto *I32 = Type::getInt32Ty(Context); - Type *TokenTy = StructType::create(Context, "continuation.token")->getPointerTo(); - - SmallVector ReturnTypes; - ReturnTypes.push_back(I8Ptr); // Continue function pointer - ReturnTypes.push_back(TokenTy); // Token to connect the function call with the resume point - StructType *NewRetTy = StructType::get(Context, ReturnTypes); + StructType *NewRetTy = StructType::get(Context, {PtrTy, PtrTy}); for (auto &FuncData : ToProcess) { Function *F = FuncData.first; @@ -103,7 +99,7 @@ void LowerAwaitPassImpl::processContinuations() { // Add new storage pointer for the coroutine passes to new function type at // the end - AllArgTypes.push_back(I8Ptr); + AllArgTypes.push_back(PtrTy); // Create new empty function auto *NewFuncTy = FunctionType::get(NewRetTy, AllArgTypes, false); @@ -135,7 +131,7 @@ void LowerAwaitPassImpl::processContinuations() { SmallVector StrBuf; auto *ContProtoFunc = cast( Mod.getOrInsertFunction((Twine("continuation.prototype.") + NewFunc->getName()).toStringRef(StrBuf), - FunctionType::get(NewRetTy, {I8Ptr, Type::getInt1Ty(Context)}, false)) + FunctionType::get(NewRetTy, {PtrTy, Type::getInt1Ty(Context)}, false)) .getCallee()); // Add metadata, marking it as a continuation function @@ -143,16 +139,16 @@ void LowerAwaitPassImpl::processContinuations() { NewFunc->setMetadata(ContHelper::MDContinuationName, ContMDTuple); ContProtoFunc->setMetadata(ContHelper::MDContinuationName, ContMDTuple); - auto *ContProtoFuncPtr = ConstantExpr::getBitCast(ContProtoFunc, I8Ptr); + auto *ContProtoFuncPtr = ConstantExpr::getBitCast(ContProtoFunc, PtrTy); // Alloc and free prototypes too - auto *ContMallocTy = FunctionType::get(I8Ptr, {I32}, false); + auto *ContMallocTy = FunctionType::get(PtrTy, {I32}, false); auto *ContMalloc = dyn_cast(Mod.getOrInsertFunction("continuation.malloc", 
ContMallocTy).getCallee()); - auto *ContMallocPtr = ConstantExpr::getBitCast(ContMalloc, I8Ptr); + auto *ContMallocPtr = ConstantExpr::getBitCast(ContMalloc, PtrTy); - auto *ContDeallocTy = FunctionType::get(Type::getVoidTy(Context), {I8Ptr}, false); + auto *ContDeallocTy = FunctionType::get(Type::getVoidTy(Context), {PtrTy}, false); auto *ContDealloc = dyn_cast(Mod.getOrInsertFunction("continuation.free", ContDeallocTy).getCallee()); - auto *ContDeallocPtr = ConstantExpr::getBitCast(ContDealloc, I8Ptr); + auto *ContDeallocPtr = ConstantExpr::getBitCast(ContDealloc, PtrTy); llvm_dialects::Builder B(&*NewFunc->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); // Claim that the buffer has the minimum required size of a pointer @@ -162,8 +158,14 @@ void LowerAwaitPassImpl::processContinuations() { Value *const CoroId = B.CreateIntrinsic(Intrinsic::coro_id_retcon, {}, {BufSize, BufAlign, StorageArg, ContProtoFuncPtr, ContMallocPtr, ContDeallocPtr}); - auto *CPN = ConstantPointerNull::get(I8Ptr); - B.CreateIntrinsic(Intrinsic::coro_begin, {}, {CoroId, CPN}); + auto *CPN = ConstantPointerNull::get(PtrTy); + + // Only one custom ABI is provided to CoroSplitPass' constructor right + // now. In the future custom ABIs may be provided to CoroSplitPass and + // their indices specified here to control the coroutine's splitting, + // spilling, reloading, frame allocation, rematting, etc. + auto *CustomABIIndex = ConstantInt::get(I32, 0); + B.CreateIntrinsic(Intrinsic::coro_begin_custom_abi, {}, {CoroId, CPN, CustomABIIndex}); // Replace await calls with suspend points for (auto *CI : FuncData.second) { @@ -177,8 +179,8 @@ void LowerAwaitPassImpl::processContinuations() { } // Insert a dummy call to remember the arguments to lgc.cps.await. 
- auto *ShaderTy = FunctionType::get(TokenTy, ArgTys, false); - auto *ShaderFun = B.CreateIntToPtr(CI->getArgOperand(0), ShaderTy->getPointerTo()); + auto *ShaderTy = FunctionType::get(PtrTy, ArgTys, false); + auto *ShaderFun = B.CreateIntToPtr(CI->getArgOperand(0), PointerType::get(Context, 0)); SuspendRetconArg = B.CreateCall(ShaderTy, ShaderFun, Args); cast(SuspendRetconArg)->copyMetadata(*CI); diff --git a/llvmraytracing/lib/LowerRaytracingPipeline.cpp b/llvmraytracing/lib/LowerRaytracingPipeline.cpp index 2f71a9394b..69ab3afc68 100644 --- a/llvmraytracing/lib/LowerRaytracingPipeline.cpp +++ b/llvmraytracing/lib/LowerRaytracingPipeline.cpp @@ -66,10 +66,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" #include #include -#include #include #include #include @@ -191,9 +189,7 @@ struct PayloadCopyHelper { unsigned NumCopyBytes = RegisterBytes * Interval.size(); unsigned FieldNumRemainingBytes = FieldNumBytes - *FieldByteOffset; - if (NumCopyBytes > FieldNumRemainingBytes) { - NumCopyBytes = FieldNumRemainingBytes; - } + NumCopyBytes = std::min(NumCopyBytes, FieldNumRemainingBytes); copyBytes(B, Dst, Src, NumCopyBytes); *FieldByteOffset += NumCopyBytes; @@ -221,11 +217,12 @@ class ModuleMetadataState final { uint32_t getMaxUsedPayloadRegisterCount() const { return MaxUsedPayloadRegisterCount; } - // Returns whether a value for maxUsedPayloadRegisterCount was set in the input module. - // If that is the case, for driver functions we rely on it. - // This mechanism ensures we don't rely on it in case the value was only initialized - // during processing of the current module. 
- bool maxUsedPayloadRegisterCountWasSet() const { return MaxUsedPayloadRegisterCountWasSet; } + uint32_t getNumPassedThroughPayloadDwords() const { + if (MaxUsedPayloadRegisterCountWasSet) + return MaxUsedPayloadRegisterCount; + + return MaxPayloadRegisterCount; + } uint32_t getMaxHitAttributeByteCount() const { return MaxHitAttributeByteCount; } @@ -258,6 +255,10 @@ class ModuleMetadataState final { /// If the module has lgc.cps.module metadata attached. bool IsInLgcCpsMode = false; + // Describes whether a value for maxUsedPayloadRegisterCount was set in the input module. + // If that is the case, for driver functions we rely on it. + // This mechanism ensures we don't rely on it in case the value was only initialized + // during processing of the current module. bool MaxUsedPayloadRegisterCountWasSet = false; }; @@ -269,12 +270,24 @@ class LowerRaytracingPipelinePassImpl final { private: struct FunctionData { RayTracingShaderStage Kind = RayTracingShaderStage::Count; + +#define DECLARE_KIND_GETTER(Stage) \ + bool is##Stage() const { return Kind == RayTracingShaderStage::Stage; } + DECLARE_KIND_GETTER(RayGeneration) + DECLARE_KIND_GETTER(Intersection) + DECLARE_KIND_GETTER(AnyHit) + DECLARE_KIND_GETTER(ClosestHit) + DECLARE_KIND_GETTER(Miss) + DECLARE_KIND_GETTER(Callable) + DECLARE_KIND_GETTER(Traversal) + DECLARE_KIND_GETTER(KernelEntry) +#undef DECLARE_KIND_GETTER + SmallVector TraceRayCalls; SmallVector ReportHitCalls; SmallVector CallShaderCalls; /// Calls to hlsl intrinsics that cannot be rematerialized SmallVector IntrinsicCalls; - SmallVector ShaderIndexCalls; SmallVector ShaderRecordBufferCalls; SmallVector JumpCalls; @@ -394,16 +407,15 @@ class LowerRaytracingPipelinePassImpl final { /// Compute the dword at which payload starts in the argument at most in the /// argument list. 
std::optional getPayloadStartDword(FunctionData &Data, uint32_t MaxHitAttributeBytes, - Type *TraversalDataTy, bool CpsMode) { + Type *TraversalDataTy) { if (Data.PayloadStorageTy->getArrayNumElements() == 0) return std::nullopt; assert(TraversalDataTy && "Failed to detect traversal system data type"); - // For lgc.cps mode, take into account the shader index dword is inserted at a later stage. // Always ensure that we consider the two dword barycentric coordinates // passed as argument for _AmdEnqueueAnyHit calls. - return (CpsMode ? 1 : 0) + getArgumentDwordCount(DL, TraversalDataTy) + + return getArgumentDwordCount(DL, TraversalDataTy) + #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 503627 // Old version of the code std::max(divideCeil(MaxHitAttributeBytes, RegisterBytes), uint64_t(2)); @@ -419,15 +431,14 @@ class LowerRaytracingPipelinePassImpl final { /// Returns a pair (paddingType, payloadType). std::pair computePaddingAndPayloadArgTys(SmallVectorImpl &ArgTys, uint32_t PayloadSizeDwords, - std::optional PayloadStartDword, - uint32_t Offset = 0) { + std::optional PayloadStartDword) { Type *PaddingTy = nullptr; - const uint32_t ShiftedStartDword = PayloadStartDword.has_value() ? PayloadStartDword.value() - Offset : 0; + const uint32_t StartDword = PayloadStartDword.value_or(0); #ifndef NDEBUG LLVM_DEBUG(dbgs() << "Computing padding and payload based on following data:\n" << "Payload size: " << PayloadSizeDwords << " dwords\n" - << "Payload start dword: " << ShiftedStartDword << "\nArgument types:\n"); + << "Payload start dword: " << StartDword << "\nArgument types:\n"); for (Type *Ty : ArgTys) LLVM_DEBUG(dbgs() << *Ty << ": " << lgc::cps::getArgumentDwordCount(DL, Ty) << " dwords\n"); #endif @@ -436,7 +447,7 @@ class LowerRaytracingPipelinePassImpl final { // If PayloadStartDword is set to std::nullopt, then we don't pass // payload, thus we don't need padding. 
if (PayloadStartDword.has_value()) { - PaddingTy = ContHelper::getPaddingType(DL, Mod.getContext(), ArgTys, ShiftedStartDword); + PaddingTy = ContHelper::getPaddingType(DL, Mod.getContext(), ArgTys, StartDword); } else { assert(PayloadSizeDwords == 0 && "PayloadHelper::computePaddingAndPayloadArgTys: Expected zero " "payload dwords!"); @@ -481,10 +492,9 @@ class LowerRaytracingPipelinePassImpl final { }; void replaceCall(FunctionData &Data, CallInst *Call, Function *Func, ContinuationCallType CallType); - void handleRestoreSystemData(CallInst *Call); void handleExitRayGen(const FunctionData &Data); - void replaceContinuationCall(ContinuationCallType CallType, CallInst *Call, const FunctionData &Data, - Value *PayloadOrAttrs, Type *PayloadOrAttrsTy); + void replaceAwaitCall(ContinuationCallType CallType, CallInst *Call, const FunctionData &Data, Value *PayloadOrAttrs, + Type *PayloadOrAttrsTy); void replaceReportHitCall(FunctionData &Data, CallInst *Call); void replaceShaderIndexCall(FunctionData &Data, CallInst *Call); @@ -492,12 +502,9 @@ class LowerRaytracingPipelinePassImpl final { void handleGetShaderKind(Function &Func); void handleGetCurrentFuncAddr(Function &Func); - void handleGetShaderRecIndex(Function &Func); void handleAmdInternalFunc(Function &Func); - void splitRestoreBB(); - void handleUnrematerializableCandidates(); void collectGpuRtFunctions(); @@ -574,7 +581,6 @@ class LowerRaytracingPipelinePassImpl final { PayloadHelper PayloadHelper; CompilerUtils::CrossModuleInliner CrossInliner; Type *I32; - Type *TokenTy; /// System data type passed to Traversal Type *TraversalDataTy; /// System data type passed to ClosestHit and Miss @@ -589,8 +595,6 @@ class LowerRaytracingPipelinePassImpl final { // system data Function *GetTriangleHitAttributes; Function *SetTriangleHitAttributes; - Function *GetLocalRootIndex; - Function *SetLocalRootIndex; Function *ExitRayGen; Function *TraceRay; Function *CallShader; @@ -630,7 +634,7 @@ 
ModuleMetadataState::ModuleMetadataState(Module &Module) : Mod{Module} { auto StackAddrspaceMD = ContHelper::tryGetStackAddrspace(Module); StackAddrspace = StackAddrspaceMD.value_or(ContHelper::DefaultStackAddrspace); - IsInLgcCpsMode = ContHelper::isLgcCpsModule(Mod); + IsInLgcCpsMode = Mod.getNamedMetadata(ContHelper::MDLgcCpsModuleName) != nullptr; } /// Write the previously derived information about max payload registers and @@ -664,19 +668,6 @@ CallInst *LowerRaytracingPipelinePassImpl::insertCpsAwait(Type *ReturnTy, Value Args); } -Function *llvm::getSetLocalRootIndex(Module &M) { - auto *Name = "amd.dx.setLocalRootIndex"; - if (auto *F = M.getFunction(Name)) - return F; - auto &C = M.getContext(); - auto *Void = Type::getVoidTy(C); - auto *I32 = Type::getInt32Ty(C); - auto *FuncTy = FunctionType::get(Void, {I32}, false); - AttributeList AL = AttributeList::get(C, AttributeList::FunctionIndex, - {Attribute::NoFree, Attribute::NoUnwind, Attribute::WillReturn}); - return cast(M.getOrInsertFunction(Name, FuncTy, AL).getCallee()); -} - // Set maximum continuation stack size metadata static void setStacksizeMetadata(Function &F, uint64_t NeededStackSize) { uint64_t CurStackSize = ContHelper::StackSize::tryGetValue(&F).value_or(0); @@ -691,9 +682,8 @@ static SmallVector flattenVectorArgument(IRBuilder<> &B, Value *Vect SmallVector Arguments; - for (unsigned Idx = 0; Idx < cast(Vector->getType())->getNumElements(); ++Idx) { + for (unsigned Idx = 0; Idx < cast(Vector->getType())->getNumElements(); ++Idx) Arguments.push_back(B.CreateExtractElement(Vector, B.getInt32(Idx))); - } return Arguments; } @@ -704,11 +694,9 @@ static SmallVector flattenVectorArgument(IRBuilder<> &B, Value *Vect static bool flattenVectorArgument(IRBuilder<> &B, Value *Arg, SmallVectorImpl &Arguments) { if (isa(Arg->getType())) { const auto &FlattenedArguments = flattenVectorArgument(B, Arg); - if (!FlattenedArguments.empty()) { - Arguments.append(FlattenedArguments.begin(), 
FlattenedArguments.end()); + Arguments.append(FlattenedArguments.begin(), FlattenedArguments.end()); - return true; - } + return !FlattenedArguments.empty(); } return false; @@ -778,7 +766,7 @@ void LowerRaytracingPipelinePassImpl::replaceCall(FunctionData &Data, CallInst * if (!Callee) continue; auto FuncName = Callee->getName(); - if (FuncName.starts_with("_AmdAwait") || FuncName.starts_with("_AmdWaitAwait")) { + if (FuncName.starts_with("_AmdAwait")) { AwaitCalls.push_back(CI); } else if (FuncName.starts_with("_AmdAcceptHitAttributes")) { AcceptHitAttrsCalls.push_back(CI); @@ -789,7 +777,7 @@ void LowerRaytracingPipelinePassImpl::replaceCall(FunctionData &Data, CallInst * for (auto *CI : AwaitCalls) { Builder.SetInsertPoint(CI); - replaceContinuationCall(CallType, CI, Data, PayloadOrAttrs, PayloadOrAttrsTy); + replaceAwaitCall(CallType, CI, Data, PayloadOrAttrs, PayloadOrAttrsTy); } for (auto *CI : AcceptHitAttrsCalls) { @@ -806,25 +794,6 @@ void LowerRaytracingPipelinePassImpl::replaceCall(FunctionData &Data, CallInst * Builder.SetInsertPoint(AfterCall); } -void LowerRaytracingPipelinePassImpl::handleRestoreSystemData(CallInst *Call) { - // Store system data - auto *SystemDataTy = cast(getFuncArgPtrElementType(Call->getCalledFunction(), 0)); - auto *SystemData = Call->getArgOperand(0); - - // Set local root signature on re-entry - auto *LocalIndexSystemDataTy = cast(getFuncArgPtrElementType(GetLocalRootIndex, 0)); - auto *LocalIndexSystemData = getDXILSystemData(Builder, SystemData, SystemDataTy, LocalIndexSystemDataTy); - - auto Stage = getLgcRtShaderStage(Call->getFunction()); - Value *LocalIndex = nullptr; - if (Stage == RayTracingShaderStage::RayGeneration) - LocalIndex = Builder.getInt32(0); - else - LocalIndex = CrossInliner.inlineCall(Builder, GetLocalRootIndex, LocalIndexSystemData).returnValue; - LocalIndex->setName("local.root.index"); - Builder.CreateCall(SetLocalRootIndex, LocalIndex); -} - /// Replace a call to lgc.rt.report.hit with a call to the 
driver /// implementation. void LowerRaytracingPipelinePassImpl::replaceReportHitCall(FunctionData &Data, CallInst *Call) { @@ -846,9 +815,9 @@ void LowerRaytracingPipelinePassImpl::replaceReportHitCall(FunctionData &Data, C /// Replace a call to Await with a call to a given address and pass generated /// token into an await call -void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallType CallType, CallInst *Call, - const FunctionData &Data, Value *PayloadOrAttrs, - Type *PayloadOrAttrsTy) { +void LowerRaytracingPipelinePassImpl::replaceAwaitCall(ContinuationCallType CallType, CallInst *Call, + const FunctionData &Data, Value *PayloadOrAttrs, + Type *PayloadOrAttrsTy) { Builder.SetInsertPoint(Call); const PAQSerializationLayout *OutgoingSerializationLayout = nullptr; @@ -871,7 +840,6 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy std::min(std::max(TraceRayInfo->LayoutsByKind[PAQSerializationLayoutKind::ClosestHitOut].NumStorageI32s, TraceRayInfo->LayoutsByKind[PAQSerializationLayoutKind::MissOut].NumStorageI32s), MetadataState.getMaxPayloadRegisterCount()); - } else { assert(CallType == ContinuationCallType::CallShader && "Unexpected call type!"); const auto *CallShaderInfo = &PAQManager.getOrCreateCallShaderSerializationInfo(PAQConfig); @@ -884,8 +852,8 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy assert(OutgoingSerializationLayout && "Missing serialization layout!"); } else { assert(CallType == ContinuationCallType::AnyHit && "Unexpected call type!"); - // For intersection, assume maximum possible number of payload registers. - ReturnedRegisterCount = MetadataState.getMaxPayloadRegisterCount(); + // For intersection, use number of passed through payload registers. 
+ ReturnedRegisterCount = Data.NumPassedThroughPayloadDwords; } if (OutgoingSerializationLayout) { @@ -905,7 +873,7 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy // Copy to payload storage Value *CastPayload = Builder.CreateBitCast( - Data.PayloadStorage, I32->getPointerTo(Data.PayloadStorage->getType()->getPointerAddressSpace())); + Data.PayloadStorage, Builder.getPtrTy(Data.PayloadStorage->getType()->getPointerAddressSpace())); Builder.CreateStore(LocalPayloadMem, CastPayload); // Set stacksize metadata on F @@ -917,45 +885,23 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy *OutgoingSerializationLayout); } - auto *ShaderAddr = Call->getArgOperand(0); - auto *FTy = Call->getFunctionType(); SmallVector ArgTys; SmallVector Args; - Value *RetAddr = nullptr; - const bool IsLgcCpsMode = MetadataState.isInLgcCpsMode(); - if (IsLgcCpsMode) { - // For LgcCps, skip function-addr, the return address will be filled at late - // stage of continuation transform. Add shader index so that the callee cps - // function get correct shader-index being passed in. - ArgTys.push_back(I32); - auto *ShaderIndex = CrossInliner - .inlineCall(Builder, GetLocalRootIndex, - getDXILSystemData(Builder, Data.SystemData, Data.SystemDataTy, - getFuncArgPtrElementType(GetLocalRootIndex, 0))) - .returnValue; - Args.push_back(ShaderIndex); - - ArgTys.append(FTy->param_begin() + 2, FTy->param_end()); - Args.append(Call->arg_begin() + 2, Call->arg_end()); - } else { - // We want to avoid having the return address included in the padding - // computation, since it is included nowhere else. This allows us to compute - // padding only on the actual tail arguments, which is the only varying part - // of the final continue call at the end. 
- uint32_t RetAddrArgIndex = 1; - if (CallType == ContinuationCallType::Traversal) { - RetAddr = PoisonValue::get(Builder.getInt32Ty()); - } else { - RetAddr = Call->getArgOperand(RetAddrArgIndex); - assert(RetAddr->getType()->isIntegerTy(32)); - ++RetAddrArgIndex; - } + constexpr uint32_t ShaderIndexArgIdx = 1; + Value *ShaderIndex = Call->getArgOperand(ShaderIndexArgIdx); + assert(ShaderIndex->getType()->isIntegerTy(32)); - ArgTys.append(FTy->param_begin() + RetAddrArgIndex, FTy->param_end()); - Args.append(Call->arg_begin() + RetAddrArgIndex, Call->arg_end()); - } + // We need to identify the tail argument list here, since this is what we need to use for computing the padding. + // That means, the first argument behind the return address is our start index. + constexpr uint32_t RetAddrArgIdx = ShaderIndexArgIdx + 1; + Value *RetAddr = Call->getArgOperand(RetAddrArgIdx); + assert(RetAddr->getType()->isIntegerTy(32)); + constexpr uint32_t TailArgStartIdx = RetAddrArgIdx + 1; + + ArgTys.append(FTy->param_begin() + TailArgStartIdx, FTy->param_end()); + Args.append(Call->arg_begin() + TailArgStartIdx, Call->arg_end()); if (CallType == ContinuationCallType::AnyHit) { // Add hit attributes to arguments @@ -973,8 +919,6 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy MetadataState.getMaxPayloadRegisterCount()); } - SmallVector ReturnedArgTys{Call->getType()}; - const bool HasPayload = Data.FirstPayloadArgumentDword.has_value(); // Add padding so that returned payload starts at a fixed dword. @@ -985,24 +929,22 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy Args.push_back(Builder.CreateLoad(OutgoingPayloadTy, Data.PayloadStorage)); } - uint32_t PaddingOffset = 1; - if (!IsLgcCpsMode) { - // Compute padding for the resume function so that payload starts at a - // fixed dword. NOTE: Minus 1 as in lgc.cps mode, shader index (i32) is not included. 
- PaddingOffset = 0; - // Patch the return address into the await call, since it got excluded for - // the padding computation previously. For WaitAwaitTraversal, this needs to - // be removed later once we have the TraversalEntry function. - Args.insert(Args.begin(), RetAddr); - } + // Compute padding for the resume function so that payload starts at a + // fixed dword. + // Patch the return address into the await call, since it got excluded for + // the padding computation previously. + Args.insert(Args.begin(), {ShaderIndex, RetAddr}); + SmallVector ReturnedArgTys{Call->getType()}; if (HasPayload) { PayloadHelper.computePaddingAndPayloadArgTys(ReturnedArgTys, ReturnedRegisterCount.value(), - Data.FirstPayloadArgumentDword, PaddingOffset); + Data.FirstPayloadArgumentDword); } + // Return shader record index + return address + ReturnedArgTys.insert(ReturnedArgTys.begin(), {Builder.getInt32Ty(), Builder.getInt32Ty()}); auto *NewRetTy = StructType::get(Builder.getContext(), ReturnedArgTys); - + auto *ShaderAddr = Call->getArgOperand(0); auto *NewCall = insertCpsAwait(NewRetTy, ShaderAddr, Call, Args, CallType, Data.Kind); NewCall->copyMetadata(*Call); @@ -1013,13 +955,10 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy ContHelper::ReturnedRegisterCount::setValue(NewCall, ReturnedRegisterCount.value()); - auto OutgoingRegisterCount = std::min(OutgoingSerializationLayout ? 
OutgoingSerializationLayout->NumStorageI32s - : MetadataState.getMaxPayloadRegisterCount(), - MetadataState.getMaxPayloadRegisterCount()); // Annotate call with the number of registers used for payload - ContHelper::OutgoingRegisterCount::setValue(NewCall, OutgoingRegisterCount); + ContHelper::OutgoingRegisterCount::setValue(NewCall, OutgoingPayloadDwords); if (OutgoingSerializationLayout) { - MetadataState.updateMaxUsedPayloadRegisterCount(OutgoingRegisterCount); + MetadataState.updateMaxUsedPayloadRegisterCount(OutgoingPayloadDwords); MetadataState.updateMaxUsedPayloadRegisterCount(ReturnedRegisterCount.value()); } @@ -1041,34 +980,48 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy } if (!Call->getType()->isVoidTy()) { - // Extract the system data from the { %systemData, %padding, %payload } + // Extract the system data from the { %shaderIndex, %rcr, %systemData, %padding, %payload } // struct returned by the await call. - Call->replaceAllUsesWith(Builder.CreateExtractValue(NewCall, 0)); + Value *ReturnedSystemData = Builder.CreateExtractValue(NewCall, 2); + Call->replaceAllUsesWith(ReturnedSystemData); + + // Find (whatever comes first) the last store of the returned system data or a call to lgc.ilcps.setLocalRootIndex. + // We use this as split point as described below. + Instruction *SplitPoint = nullptr; + auto *ParentBB = Call->getParent(); + for (auto It = ParentBB->rbegin(); It != ParentBB->rend(); ++It) { + if (auto *Store = dyn_cast(&*It); Store && Store->getValueOperand() == ReturnedSystemData) { + SplitPoint = Store; + break; + } + + if (auto *Idx = dyn_cast(&*It)) { + SplitPoint = Idx; + break; + } + } + + // After the await, we reset local state (system data, potentially the local root index). + // We need to ensure that any code rematerialized by coro passes to after the suspend point is placed after these + // restores. 
As we currently do not have a robust way to achieve that, work around the problem by splitting the BB + // after the restore code, relying on coro passes to rematerialize within the same BB as the usage. + if (SplitPoint) { + auto *Next = &*++SplitPoint->getIterator(); + if (!Next->isTerminator()) + SplitBlock(Next->getParent(), Next); + } } Call->eraseFromParent(); } -/// Replace a call to lgc.rt.shader.index with the passed shader index argument -/// for LgcCps mode or get the value from system data for non-LgcCps mode. +/// Replace a call to lgc.rt.shader.index with the passed shader index argument. void LowerRaytracingPipelinePassImpl::replaceShaderIndexCall(FunctionData &Data, CallInst *Call) { - if (Data.Kind == RayTracingShaderStage::RayGeneration) { + if (Data.isRayGeneration()) Call->replaceAllUsesWith(Builder.getInt32(0)); - } else { - Value *ShaderIndex = nullptr; - if (MetadataState.isInLgcCpsMode()) { - ShaderIndex = Call->getFunction()->getArg(CpsArgIdx::ShaderIndex); - } else { - assert(Data.SystemDataFirstStore != nullptr); - Builder.SetInsertPoint(&*++Data.SystemDataFirstStore->getIterator()); - ShaderIndex = CrossInliner - .inlineCall(Builder, GetLocalRootIndex, - getDXILSystemData(Builder, Data.SystemData, Data.SystemDataTy, - getFuncArgPtrElementType(GetLocalRootIndex, 0))) - .returnValue; - } - Call->replaceAllUsesWith(ShaderIndex); - } + else + Call->replaceAllUsesWith(Call->getFunction()->getArg(CpsArgIdx::ShaderIndex)); + Call->eraseFromParent(); } @@ -1292,7 +1245,7 @@ void LowerRaytracingPipelinePassImpl::copyHitAttributes(FunctionData &Data, Valu auto InsertPoint = Builder.saveIP(); Builder.SetInsertPoint(Builder.GetInsertBlock()->getParent()->getEntryBlock().getFirstNonPHI()); auto *InlineHitAttrsAlloc = Builder.CreateAlloca(InlineHitAttrsTy); - auto *RegTyPtr = RegTy->getPointerTo(InlineHitAttrsAlloc->getAddressSpace()); + auto *RegTyPtr = Builder.getPtrTy(InlineHitAttrsAlloc->getAddressSpace()); Builder.restoreIP(InsertPoint); auto 
*InlineHitAttrs = Builder.CreateBitCast(InlineHitAttrsAlloc, RegTyPtr); @@ -1319,8 +1272,7 @@ void LowerRaytracingPipelinePassImpl::copyHitAttributes(FunctionData &Data, Valu // Obtain pointer to global payload serialization struct Value *PayloadSerialization = Builder.CreateBitCast( - Data.PayloadStorage, - Layout->SerializationTy->getPointerTo(Data.PayloadStorage->getType()->getPointerAddressSpace())); + Data.PayloadStorage, Builder.getPtrTy(Data.PayloadStorage->getType()->getPointerAddressSpace())); // Last zero yields pointer to the first element of the i32 array PayloadHitAttrs = Builder.CreateInBoundsGEP(Layout->SerializationTy, PayloadSerialization, @@ -1330,7 +1282,7 @@ void LowerRaytracingPipelinePassImpl::copyHitAttributes(FunctionData &Data, Valu // Inline attributes suffice, nothing to do. } } else { - assert(Data.Kind == RayTracingShaderStage::Intersection && "Unexpected shader kind"); + assert(Data.isIntersection() && "Unexpected shader kind"); // We are in an intersection shader, which does not know the payload type. 
// Assume maximum possible size PayloadHitAttrBytes = MetadataState.getMaxHitAttributeByteCount() - InlineHitAttrsBytes; @@ -1362,12 +1314,10 @@ void LowerRaytracingPipelinePassImpl::copyHitAttributes(FunctionData &Data, Valu Builder.CreateStore(Val, StorePtr); } else { // Load byte by byte into a vector and pad the rest with undef - auto *ByteLoadPtr = Builder.CreateBitCast(LoadPtr, I8Ty->getPointerTo()); - auto *ByteStorePtr = Builder.CreateBitCast(StorePtr, I8Ty->getPointerTo()); for (unsigned J = 0; J < HitAttrsBytes % RegisterBytes; J++) { - auto *Val = Builder.CreateLoad( - I8Ty, CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(Builder, I8Ty, ByteLoadPtr, J)); - Builder.CreateStore(Val, CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(Builder, I8Ty, ByteStorePtr, J)); + auto *Val = + Builder.CreateLoad(I8Ty, CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(Builder, I8Ty, LoadPtr, J)); + Builder.CreateStore(Val, CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(Builder, I8Ty, StorePtr, J)); } } } @@ -1383,12 +1333,7 @@ void LowerRaytracingPipelinePassImpl::copyHitAttributes(FunctionData &Data, Valu } void LowerRaytracingPipelinePassImpl::setGpurtEntryRegisterCountMetadata() { - // Even if PreservedPayloadRegisterCount is set, there may be - // additional shaders in the current module whose usage is recorded - // in MaxUsedPayloadRegisterCount, to take the max with it. - uint32_t MaxRegisterCount = MetadataState.maxUsedPayloadRegisterCountWasSet() - ? 
MetadataState.getMaxUsedPayloadRegisterCount() - : MetadataState.getMaxPayloadRegisterCount(); + uint32_t MaxRegisterCount = MetadataState.getNumPassedThroughPayloadDwords(); struct VisitorState { ModuleMetadataState &Metadata; @@ -1436,7 +1381,6 @@ void LowerRaytracingPipelinePassImpl::setGpurtEntryRegisterCountMetadata() { } void LowerRaytracingPipelinePassImpl::processContinuations() { - TokenTy = StructType::create(*Context, "continuation.token")->getPointerTo(); I32 = Type::getInt32Ty(*Context); for (auto &FuncData : ToProcess) { @@ -1542,7 +1486,7 @@ void LowerRaytracingPipelinePassImpl::prepareAnyHitExits(Function *F, FunctionDa void LowerRaytracingPipelinePassImpl::processFunctionEnd(FunctionData &Data, FunctionEndData &EData) { AnyHitExitKind AHExitKind = AnyHitExitKind::None; - bool IsAnyHit = Data.Kind == RayTracingShaderStage::AnyHit; + bool IsAnyHit = Data.isAnyHit(); Builder.SetInsertPoint(EData.Terminator); @@ -1594,7 +1538,7 @@ void LowerRaytracingPipelinePassImpl::processFunctionEnd(FunctionData &Data, Fun RetValue = Builder.CreateLoad(Data.ReturnTy, SystemData); } - if (Data.Kind == RayTracingShaderStage::RayGeneration) { + if (Data.isRayGeneration()) { assert(!RetValue && "RayGen cannot return anything"); if (ExitRayGen) handleExitRayGen(Data); @@ -1608,10 +1552,7 @@ void LowerRaytracingPipelinePassImpl::processFunctionEnd(FunctionData &Data, Fun SmallVector PaddingArgs; SmallVector TailArgList; - if (MetadataState.isInLgcCpsMode()) { - // Jump to resume point of caller, pass Poison ShaderIndex as it is not meaningful here. 
- PaddingArgs.push_back(PoisonValue::get(I32)); - } + Value *DummyI32 = PoisonValue::get(I32); Function *Parent = EData.Terminator->getFunction(); @@ -1638,9 +1579,8 @@ void LowerRaytracingPipelinePassImpl::processFunctionEnd(FunctionData &Data, Fun PayloadHelper.appendPaddingAndPayloadValues(PaddingArgs, TailArgList, OutgoingRegisterCount, Data.FirstPayloadArgumentDword, Data.PayloadStorage); - Value *DummyI32 = PoisonValue::get(I32); Instruction *Jump = Builder.create(ReturnAddr, getPotentialCpsReturnLevels(Data.Kind), DummyI32, - DummyI32, TailArgList); + DummyI32, DummyI32, TailArgList); Builder.CreateUnreachable(); EData.Terminator->eraseFromParent(); @@ -1679,34 +1619,18 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData Type *NewRetTy; Type *SystemDataTy = nullptr; - uint32_t SystemDataArgumentIndex = 0; - - // We always have a return address argument, which must not be included in the padding computation. - if (MetadataState.isInLgcCpsMode()) { - // Create the CPS function header. - - // A CPS function signature consists of: - // * Return continuation reference (RCR): i32 - // * Shader index - // * Remaining arguments (system data, optionally hit attributes) - // We need to determine the starting dword of payload storage in arguments, - // so that payload starts at a fixed VGPR across all shaders in a pipeline. - // The overall layout is: - // | returnAddr | shaderIndex | systemData | hitAttrs | padding | payload | - // For systemData and hitAttrs, use the max possible sizes for calculation. + // Create the CPS function header. - AllArgTypes.push_back(Builder.getInt32Ty()); - - SystemDataArgumentIndex = 2; - } else { - // The overall layout is: - // | returnAddr | systemData | (hitAttrs, remaining args) | padding | - // payload - // If we don't pass payload, then for stability reasons, we still pass in a - // zero- padding and payload-array that remains unused. 
- - SystemDataArgumentIndex = 1; - } + // A CPS function signature consists of: + // * Shader index + // * Return continuation reference (RCR): i32 + // * Remaining arguments (system data, optionally hit attributes) + // We need to determine the starting dword of payload storage in arguments, + // so that payload starts at a fixed VGPR across all shaders in a pipeline. + // The overall layout is: + // | shaderIndex | returnAddr | systemData | hitAttrs | padding | payload | + // For systemData and hitAttrs, use the max possible sizes for calculation. + // We always have return address and shader index arguments, which must not be included in the padding computation. // If the value is not computed in the switch case, it will be re-computed // based on the incoming serialization layout info. @@ -1724,7 +1648,7 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData SystemDataTy = TraversalDataTy; AllArgTypes.push_back(SystemDataTy); NewRetTy = SystemDataTy; - Data.NumPassedThroughPayloadDwords = MetadataState.getMaxPayloadRegisterCount(); + Data.NumPassedThroughPayloadDwords = getUpperBoundOnTraceRayPayloadRegisters(); break; } case RayTracingShaderStage::AnyHit: { @@ -1758,9 +1682,8 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData AllArgTypes.push_back(SystemDataTy); NewRetTy = SystemDataTy; - Data.NumPassedThroughPayloadDwords = MetadataState.maxUsedPayloadRegisterCountWasSet() - ? 
MetadataState.getMaxUsedPayloadRegisterCount() - : MetadataState.getMaxPayloadRegisterCount(); + Data.NumPassedThroughPayloadDwords = MetadataState.getNumPassedThroughPayloadDwords(); + break; } default: @@ -1781,12 +1704,12 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData assert(NumIncomingPayloadDwords.has_value()); Data.PayloadStorageTy = PayloadHelper.getPayloadStorageTy(MetadataState.getMaxPayloadRegisterCount(), Data); - Data.FirstPayloadArgumentDword = PayloadHelper.getPayloadStartDword(Data, MetadataState.getMaxHitAttributeByteCount(), - TraversalDataTy, MetadataState.isInLgcCpsMode()); + Data.FirstPayloadArgumentDword = + PayloadHelper.getPayloadStartDword(Data, MetadataState.getMaxHitAttributeByteCount(), TraversalDataTy); const bool HasPayloadArgument = Data.Kind != RayTracingShaderStage::RayGeneration; if (HasPayloadArgument) { - if (MetadataState.isInLgcCpsMode() && Data.Kind != RayTracingShaderStage::AnyHit) { + if (Data.Kind != RayTracingShaderStage::AnyHit) { // Add a dummy argument for CpsArgIdx::HitAttributes so that the arg index // of payload matches CpsArgIdx::Payload AllArgTypes.push_back(StructType::get(*Context, {})); @@ -1796,11 +1719,8 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData Data.FirstPayloadArgumentDword); } - // Pass in the return address argument - { - const uint32_t RetAddrPos = MetadataState.isInLgcCpsMode() ? 1 : 0; - AllArgTypes.insert(AllArgTypes.begin() + RetAddrPos, Builder.getInt32Ty()); - } + // Pass in the shader index and return address arguments so they don't get included in the padding. 
+ AllArgTypes.insert(AllArgTypes.begin(), {Builder.getInt32Ty(), Builder.getInt32Ty()}); Data.PayloadSpillSize = computePayloadSpillSize(Data.MaxOutgoingPayloadI32s, MetadataState.getMaxPayloadRegisterCount()); @@ -1818,40 +1738,34 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData llvm::moveFunctionBody(*F, *NewFunc); Data.SystemDataTy = cast(SystemDataTy); - processFunctionEntry(Data, NewFunc->getArg(SystemDataArgumentIndex)); + processFunctionEntry(Data, NewFunc->getArg(CpsArgIdx::SystemData)); if (MetadataState.isInLgcCpsMode()) { - NewFunc->getArg(CpsArgIdx::ShaderIndex)->setName("shader.index"); - // Mark as CPS function with the corresponding level. CpsLevel Level = getCpsLevelForShaderStage(Data.Kind); setCpsFunctionLevel(*NewFunc, Level); } if (Data.Kind != RayTracingShaderStage::RayGeneration) { - if (MetadataState.isInLgcCpsMode()) { - NewFunc->getArg(CpsArgIdx::SystemData)->setName("system.data"); - NewFunc->getArg(CpsArgIdx::HitAttributes)->setName("hit.attrs"); - } - - NewFunc->getArg(NewFunc->arg_size() - 2)->setName("padding"); - NewFunc->getArg(NewFunc->arg_size() - 1)->setName("payload"); + NewFunc->getArg(CpsArgIdx::SystemData)->setName("system.data"); + NewFunc->getArg(CpsArgIdx::HitAttributes)->setName("hit.attrs"); + NewFunc->getArg(CpsArgIdx::Padding)->setName("padding"); + NewFunc->getArg(CpsArgIdx::Payload)->setName("payload"); } Value *NewSystemData = nullptr; - const bool IsTraversal = Data.Kind == RayTracingShaderStage::Traversal; + const bool IsTraversal = Data.isTraversal(); if (IsTraversal) { assert(F->arg_size() == 1); if (MetadataState.isInLgcCpsMode()) { // System data // NOTE: Pointer address space may not match based on data layout, mutate // the address space here to keep later GEP valid. 
- Data.SystemData->mutateType( - getWithSamePointeeType(Data.SystemData->getType(), F->getArg(0)->getType()->getPointerAddressSpace())); + Data.SystemData->mutateType(F->getArg(0)->getType()); NewSystemData = Data.SystemData; } else { // Replace old system data argument with cloned functions' argument - NewSystemData = NewFunc->getArg(1); + NewSystemData = NewFunc->getArg(CpsArgIdx::SystemData); } } @@ -1866,10 +1780,11 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData if (NewSystemData) F->getArg(0)->replaceAllUsesWith(NewSystemData); + NewFunc->getArg(CpsArgIdx::ShaderIndex)->setName("shaderIndex"); NewFunc->getArg(CpsArgIdx::ReturnAddr)->setName("returnAddr"); FunctionEndData EData; - if (Data.Kind == RayTracingShaderStage::RayGeneration) { + if (Data.isRayGeneration()) { if (!MetadataState.isInLgcCpsMode()) NewFunc->setMetadata(ContHelper::MDEntryName, MDTuple::get(*Context, {})); @@ -1890,7 +1805,7 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData // Check that our assumptions about the number of required payload registers // are correct. We exclude callable shaders because the max payload size // doesn't apply to them. 
- assert((Data.Kind == RayTracingShaderStage::Callable || SerializationInfo == nullptr || + assert((Data.isCallable() || SerializationInfo == nullptr || std::min(MetadataState.getMaxPayloadRegisterCount(), SerializationInfo->MaxStorageI32s) <= getUpperBoundOnTraceRayPayloadRegisters()) && "Payload serialization layout uses too many registers!"); @@ -1945,7 +1860,7 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData } // Handle hit attributes - if (Data.Kind == RayTracingShaderStage::AnyHit) { + if (Data.isAnyHit()) { assert(F->arg_size() == 2 && "Shader has more arguments than expected"); auto *HitAttrs = F->getArg(1); @@ -1965,13 +1880,10 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData copyHitAttributes(Data, Data.SystemData, Data.SystemDataTy, OrigHitAttrsAlloca, true, &IncomingSerializationLayout); - // Copy new hit attributes from argument: - // Since the argument list of NewFunc ends with padding and payload, - // subtract 3 to get the hit attributes. - unsigned HitAttributesIdx = MetadataState.isInLgcCpsMode() ? 
CpsArgIdx::HitAttributes : NewFunc->arg_size() - 3; - Builder.CreateStore(NewFunc->getArg(HitAttributesIdx), HitAttrsAlloca); + // Copy new hit attributes from argument + Builder.CreateStore(NewFunc->getArg(CpsArgIdx::HitAttributes), HitAttrsAlloca); HitAttrs->replaceAllUsesWith(HitAttrsAlloca); - } else if (Data.Kind == RayTracingShaderStage::ClosestHit) { + } else if (Data.isClosestHit()) { assert(F->arg_size() == 2 && "Shader has more arguments than expected"); auto *OrigHitAttrs = F->getArg(1); @@ -1989,7 +1901,7 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData OrigHitAttrs->replaceAllUsesWith(NewHitAttrs); copyHitAttributes(Data, Data.SystemData, Data.SystemDataTy, NewHitAttrs, true, &IncomingSerializationLayout); } - } else if (Data.Kind == RayTracingShaderStage::Intersection) { + } else if (Data.isIntersection()) { // Annotate intersection shader with the maximum number of registers // used for payload // TODO: When compiling a pipeline and not a library, we could figure @@ -1997,10 +1909,10 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData // use that instead. For a library compile, we can't know the // max payload size of shaders in pipelines this shader is used // in. - ContHelper::IncomingRegisterCount::setValue(NewFunc, MetadataState.getMaxPayloadRegisterCount()); + ContHelper::IncomingRegisterCount::setValue(NewFunc, Data.NumPassedThroughPayloadDwords.value()); // Intentionally do NOT update MaxUsedPayloadRegisterCount } else { - assert(Data.Kind == RayTracingShaderStage::Traversal); + assert(Data.isTraversal()); // Intentionally do nothing for Traversal. We explicitly add Traversal // register count metadata elsewhere. 
} @@ -2014,10 +1926,10 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData } Data.ReturnTy = NewRetTy; - if (Data.Kind == RayTracingShaderStage::AnyHit) + if (Data.isAnyHit()) prepareAnyHitExits(NewFunc, Data); - if (Data.Kind == RayTracingShaderStage::Traversal) { + if (Data.isTraversal()) { PayloadHelper.patchJumpCalls(NewFunc, Data.JumpCalls, Data.FirstPayloadArgumentDword, Data.NumPassedThroughPayloadDwords, Data.PayloadStorage); } @@ -2050,10 +1962,6 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData replaceCall(Data, Call, CallShader, ContinuationCallType::CallShader); } - // Replace ShaderIndexOp calls - for (auto *Call : Data.ShaderIndexCalls) - replaceShaderIndexCall(Data, Call); - // Replace ShaderRecordBufferOp calls for (auto *Call : Data.ShaderRecordBufferCalls) { Builder.SetInsertPoint(&*++Call->getIterator()); @@ -2081,6 +1989,20 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData } } + // Lower lgc.rt.shader.index to the shader index argument or 0. 
+ static const auto ShaderIndexVisitor = + llvm_dialects::VisitorBuilder>() + .add([](SmallVector &ShaderIndexCalls, lgc::rt::ShaderIndexOp &Op) { + ShaderIndexCalls.push_back(&Op); + }) + .build(); + + SmallVector ShaderIndexCalls; + ShaderIndexVisitor.visit(ShaderIndexCalls, *F); + + for (auto *Call : ShaderIndexCalls) + replaceShaderIndexCall(Data, Call); + #ifndef NDEBUG if (Data.Kind != RayTracingShaderStage::RayGeneration) { // Check that all returns have registercount metadata @@ -2088,7 +2010,7 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData auto *Terminator = BB.getTerminator(); if (Terminator->getOpcode() == Instruction::Ret) { // Traversal needs to end with jumps + unreachable - if (Data.Kind == RayTracingShaderStage::Traversal) + if (Data.isTraversal()) report_fatal_error("Disallowed return found in Traversal, all code paths need to end with an Enqueue"); else if (!ContHelper::OutgoingRegisterCount::tryGetValue(Terminator)) report_fatal_error("Missing registercount metadata!"); @@ -2207,15 +2129,7 @@ void LowerRaytracingPipelinePassImpl::collectProcessableFunctions() { void LowerRaytracingPipelinePassImpl::handleAmdInternalFunc(Function &Func) { StringRef FuncName = Func.getName(); - if (FuncName.starts_with("_AmdRestoreSystemData")) { - assert(Func.arg_size() == 1 - // Function address - && Func.getFunctionType()->getParamType(0)->isPointerTy()); - llvm::forEachCall(Func, [&](llvm::CallInst &CInst) { - Builder.SetInsertPoint(&CInst); - handleRestoreSystemData(&CInst); - }); - } else if (FuncName.starts_with("_AmdGetFuncAddr")) { + if (FuncName.starts_with("_AmdGetFuncAddr")) { ContHelper::handleGetFuncAddr(Func, Builder); } else if (FuncName.starts_with("_AmdGetShaderKind")) { handleGetShaderKind(Func); @@ -2224,26 +2138,6 @@ void LowerRaytracingPipelinePassImpl::handleAmdInternalFunc(Function &Func) { } } -// Split BB after _AmdRestoreSystemData. 
-// The coroutine passes rematerialize to the start of the basic block of a use. -// We split the block so that every rematerialized dxil intrinsic lands after -// the restore call and accesses the restored system data. -// If we did not do that, an intrinsic that is rematerialized to before -// RestoreSystemData is called gets an uninitialized system data struct as -// argument. -void LowerRaytracingPipelinePassImpl::splitRestoreBB() { - for (auto &F : *Mod) { - if (F.getName().starts_with("_AmdRestoreSystemData")) { - llvm::forEachCall(F, [](llvm::CallInst &CInst) { - auto *Next = &*++CInst.getIterator(); - CInst.eraseFromParent(); - if (!Next->isTerminator()) - SplitBlock(Next->getParent(), Next); - }); - } - } -} - // Search for known intrinsics that cannot be rematerialized void LowerRaytracingPipelinePassImpl::handleUnrematerializableCandidates() { for (auto &Func : *Mod) { @@ -2291,16 +2185,6 @@ void LowerRaytracingPipelinePassImpl::collectGpuRtFunctions() { && (SetTriangleHitAttributes->getFunctionType()->getParamType(1)->isStructTy() || SetTriangleHitAttributes->getFunctionType()->getParamType(1)->isPointerTy())); - GetLocalRootIndex = GpurtLibrary->getFunction(ContDriverFunc::GetLocalRootIndexName); - - assert(GetLocalRootIndex && "Could not find GetLocalRootIndex function"); - assert(GetLocalRootIndex->getReturnType()->isIntegerTy(32) && - GetLocalRootIndex->arg_size() == 1 - // Dispatch data - && GetLocalRootIndex->getFunctionType()->getParamType(0)->isPointerTy()); - - SetLocalRootIndex = getSetLocalRootIndex(*Mod); - ExitRayGen = GpurtLibrary->getFunction(ContDriverFunc::ExitRayGenName); if (ExitRayGen) assert(ExitRayGen->getReturnType()->isVoidTy() && ExitRayGen->arg_size() == 1 && @@ -2357,7 +2241,7 @@ void LowerRaytracingPipelinePassImpl::collectGpuRtFunctions() { } void LowerRaytracingPipelinePassImpl::determineDispatchSystemDataType() { - Function *DispatchRaysIndex = GpurtLibrary->getFunction("_cont_DispatchRaysIndex3"); + Function 
*DispatchRaysIndex = GpurtLibrary->getFunction(ContDriverFunc::DispatchRaysIndex3Name); assert(DispatchRaysIndex && "LowerRaytracingPipelinePassImpl::determineDispatchSystemDataType: Could not find _cont_DispatchRaysIndex3!"); @@ -2387,56 +2271,50 @@ PreservedAnalyses LowerRaytracingPipelinePassImpl::run() { static const auto Visitor = llvm_dialects::VisitorBuilder() - .addSet( - [](VisitorState &State, Instruction &Op) { - auto *CInst = cast(&Op); - auto Data = State.Processables.find(CInst->getFunction()); - if (Data == State.Processables.end()) - return; - - if (isa(Op)) { - Data->second.ShaderIndexCalls.push_back(CInst); - return; - } - - if (isa(Op)) { - Data->second.ShaderRecordBufferCalls.push_back(CInst); - return; - } - - if (auto *Jump = dyn_cast(CInst)) { - Data->second.JumpCalls.push_back(Jump); - return; - } - - Type *PayloadTy = ContHelper::getPayloadTypeFromMetadata(*CInst); - - if (!isa(Op)) { - PAQPayloadConfig PAQPayload = {PayloadTy, State.Metadata.getMaxHitAttributeByteCount()}; - - uint32_t PayloadStorageI32s = 0; - if (isa(Op)) { - PayloadStorageI32s = State.PAQManager.getMaxPayloadStorageI32sForTraceRayFunc(PAQPayload); - - Data->second.TraceRayCalls.push_back(CInst); - } else if (isa(Op)) { - PayloadStorageI32s = State.PAQManager.getMaxPayloadStorageI32sForCallShaderFunc(PAQPayload); - - Data->second.CallShaderCalls.push_back(CInst); - } - - Data->second.MaxOutgoingPayloadI32s = - std::max(Data->second.MaxOutgoingPayloadI32s, PayloadStorageI32s); - } else { - // The converter uses payload type metadata also to indicate hit - // attribute types - assert((!Data->second.HitAttributes || Data->second.HitAttributes == PayloadTy) && - "Multiple reportHit calls with different hit attributes"); - Data->second.HitAttributes = PayloadTy; - - Data->second.ReportHitCalls.push_back(CInst); - } - }) + .addSet([](VisitorState &State, + Instruction &Op) { + auto *CInst = cast(&Op); + auto Data = State.Processables.find(CInst->getFunction()); + if (Data == 
State.Processables.end()) + return; + + if (isa(Op)) { + Data->second.ShaderRecordBufferCalls.push_back(CInst); + return; + } + + if (auto *Jump = dyn_cast(CInst)) { + Data->second.JumpCalls.push_back(Jump); + return; + } + + Type *PayloadTy = ContHelper::getPayloadTypeFromMetadata(*CInst); + + if (!isa(Op)) { + PAQPayloadConfig PAQPayload = {PayloadTy, State.Metadata.getMaxHitAttributeByteCount()}; + + uint32_t PayloadStorageI32s = 0; + if (isa(Op)) { + PayloadStorageI32s = State.PAQManager.getMaxPayloadStorageI32sForTraceRayFunc(PAQPayload); + + Data->second.TraceRayCalls.push_back(CInst); + } else if (isa(Op)) { + PayloadStorageI32s = State.PAQManager.getMaxPayloadStorageI32sForCallShaderFunc(PAQPayload); + + Data->second.CallShaderCalls.push_back(CInst); + } + + Data->second.MaxOutgoingPayloadI32s = std::max(Data->second.MaxOutgoingPayloadI32s, PayloadStorageI32s); + } else { + // The converter uses payload type metadata also to indicate hit + // attribute types + assert((!Data->second.HitAttributes || Data->second.HitAttributes == PayloadTy) && + "Multiple reportHit calls with different hit attributes"); + Data->second.HitAttributes = PayloadTy; + + Data->second.ReportHitCalls.push_back(CInst); + } + }) .build(); VisitorState S{PAQManager, ToProcess, MetadataState}; @@ -2465,8 +2343,6 @@ PreservedAnalyses LowerRaytracingPipelinePassImpl::run() { } } - splitRestoreBB(); - if (Mod == GpurtLibrary) { // For tests, remove intrinsic implementations from the module for (auto &F : make_early_inc_range(*Mod)) { diff --git a/llvmraytracing/lib/PipelineState.cpp b/llvmraytracing/lib/PipelineState.cpp index b69c0d4f89..f617239065 100644 --- a/llvmraytracing/lib/PipelineState.cpp +++ b/llvmraytracing/lib/PipelineState.cpp @@ -127,4 +127,16 @@ void PipelineState::merge(const PipelineState &Other) { SDSState.merge(Other.SDSState); } +void PipelineState::print(llvm::raw_ostream &OS) const { + OS << "PipelineState { MaxUsedPayloadRegisterCount=" << MaxUsedPayloadRegisterCount 
<< ", SDSState="; + SDSState.print(OS); + OS << " }\n"; +} + +#ifndef NDEBUG +void PipelineState::dump() const { + print(dbgs()); +} +#endif + } // namespace llvmraytracing diff --git a/llvmraytracing/lib/SpecializeDriverShaders.cpp b/llvmraytracing/lib/SpecializeDriverShaders.cpp index b625b41d32..4e31c35457 100644 --- a/llvmraytracing/lib/SpecializeDriverShaders.cpp +++ b/llvmraytracing/lib/SpecializeDriverShaders.cpp @@ -26,7 +26,6 @@ //===- SpecializeDriverShaders.cpp - Specialize driver shaders based on full-pipeline knowledge -------------------===// #include "llvmraytracing/SpecializeDriverShaders.h" -#include "compilerutils/CompilerUtils.h" #include "compilerutils/ValueOriginTracking.h" #include "compilerutils/ValueSpecialization.h" #include "llvmraytracing/ContinuationsUtil.h" @@ -36,6 +35,7 @@ #include "llvm-dialects/Dialect/Visitor.h" #include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/IR/Module.h" +#include "llvm/Support/ErrorHandling.h" #include using namespace llvm; @@ -101,6 +101,31 @@ Metadata *getI32MDConstant(LLVMContext &Context, uint32_t Value) { } // namespace MDHelper +namespace FirstRelevantArgIdx { +// Ignore: shaderRecIdx, returnAddr +constexpr static unsigned Incoming = 2; + +// Ignore: shaderAddr, levels, csp, shaderRecIdx, returnAddr +constexpr static unsigned JumpArg = 5; + +// Ignore: shaderAddr, levels, shaderRecIdx, returnAddr +constexpr static unsigned AwaitArg = 4; + +// Ignore: shaderRecIdx, returnAddr +constexpr static unsigned AwaitResult = 2; + +static unsigned getForJumpOrAwait(CallInst *JumpOrAwait) { + if (isa(JumpOrAwait)) + return JumpArg; + + if (isa(JumpOrAwait)) + return AwaitArg; + + report_fatal_error("getForJumpOrAwait: Should not be called on a CallInst " + "that is not a lgc.cps.jump or lgc.cps.await!"); +} +} // namespace FirstRelevantArgIdx + // Utilities to keep track of the "status" of individual arg slots. 
// There is some similarity between these pairs of types: // * ArgSlotStatus and ValueTracking::SliceStatus @@ -461,24 +486,17 @@ class ArgumentLayoutInfo { } }; -// Stores an outgoing jump, together with the first outgoing argument that should be considered. -struct JumpInfo { - CallInst *Outgoing = nullptr; - unsigned FirstRelevantOutgoingArgIdx = 0; -}; +struct FunctionData { + lgc::rt::RayTracingShaderStage Stage = lgc::rt::RayTracingShaderStage::Count; + bool IsDuringTraversal = false; + + // Stores an outgoing jump. + SmallVector Jumps; -struct AwaitInfo : public JumpInfo { // Handle lgc.cps.await. // lgc.cps uses a single await call, like: // %result = call @lgc.cps.await(i32 %target, i32 %levels, args...) - CallInst *AwaitedResult = nullptr; -}; - -struct FunctionData { - lgc::rt::RayTracingShaderStage Stage = lgc::rt::RayTracingShaderStage::Count; - bool IsDuringTraversal = false; - SmallVector Jumps; - SmallVector Awaits; + SmallVector Awaits; }; struct SpecializeDriverShadersPassImpl { @@ -499,8 +517,7 @@ struct SpecializeDriverShadersPassImpl { SmallVector TraversalFunctions; Type *I32 = nullptr; // When considering incoming function args to be preserved/specialized, ignore this many arguments. - unsigned FirstRelevantIncomingArgIdx = -1; - unsigned FirstRelevantOutgoingJumpArgIdx = -1; + // Cache for per-type ArgumentLayoutInfos. unique_ptr for stable storage as DenseMap may invalidate iterators. 
DenseMap> ArgLayoutInfos; @@ -509,17 +526,6 @@ struct SpecializeDriverShadersPassImpl { : M{M}, DL{M.getDataLayout()}, Opts{Opts}, TraversalArgsInfo{TraversalArgsInfo}, I32{Type::getInt32Ty( M.getContext())} { HadNonTrivialIncomingTraversalArgsInfo = !TraversalArgsInfo.ArgSlots.empty(); - if (ContHelper::isLgcCpsModule(M)) { - // Ignore return addr, shaderRecIdx - FirstRelevantIncomingArgIdx = 2; - // Ignore: shaderAddr, levels, csp, returnAddr, shaderRecIdx - FirstRelevantOutgoingJumpArgIdx = 5; - } else { - // Ignore returnAddr - FirstRelevantIncomingArgIdx = 1; - // Ignore: shaderAddr, levels, csp, returnAddr - FirstRelevantOutgoingJumpArgIdx = 4; - } } PreservedAnalyses run(ModuleAnalysisManager &AnalysisManager) { @@ -604,18 +610,16 @@ struct SpecializeDriverShadersPassImpl { static const auto HandleJumpOrAwait = [](State &State, Instruction &Op) { Function *F = Op.getFunction(); - auto *CI = cast(&Op); auto *It = State.Self.ToProcess.find(F); if (It == State.Self.ToProcess.end()) return; FunctionData &Data = It->second; - if (isa(Op)) { - Data.Jumps.push_back({CI, State.Self.FirstRelevantOutgoingJumpArgIdx}); + if (auto *Jump = dyn_cast(&Op)) { + Data.Jumps.push_back(Jump); } else { assert(isa(Op)); - // ignore: shaderAddr, levels, shaderRecIdx - Data.Awaits.push_back({{CI, 3}, CI}); + Data.Awaits.push_back(cast(&Op)); } }; @@ -644,7 +648,7 @@ struct SpecializeDriverShadersPassImpl { IncomingArgSlotValuesWithOffsets Result{}; // Collect incoming args - for (unsigned ArgIdx = FirstRelevantIncomingArgIdx; ArgIdx < F->arg_size(); ++ArgIdx) { + for (unsigned ArgIdx = FirstRelevantArgIdx::Incoming; ArgIdx < F->arg_size(); ++ArgIdx) { Value *Arg = F->getArg(ArgIdx); const ArgumentLayoutInfo &ArgLayoutInfo = getOrComputeArgumentLayoutInfo(Arg->getType()); @@ -676,8 +680,7 @@ struct SpecializeDriverShadersPassImpl { // but that doesn't make a difference as the outgoing await is separately analyzed, // and non-preserved args are detected when doing that. 
Result.AwaitOriginAssumptions.emplace(); - for (const auto &AwaitInfo : Data.Awaits) { - auto *AwaitResult = AwaitInfo.AwaitedResult; + for (const auto &AwaitResult : Data.Awaits) { // Await results are expected to be a struct type that wraps the actual args // We treat the struct members like incoming function arguments, // because await lowering will turn the part after the await into a function that takes exactly @@ -696,7 +699,7 @@ struct SpecializeDriverShadersPassImpl { unsigned AccumArgSlot = 0; bool Stop = false; - for (unsigned ElemIdx = 0; ElemIdx < STy->getNumElements() && !Stop; ++ElemIdx) { + for (unsigned ElemIdx = FirstRelevantArgIdx::AwaitResult; ElemIdx < STy->getNumElements() && !Stop; ++ElemIdx) { auto *ElemTy = STy->getElementType(ElemIdx); unsigned ElementByteOffset = SL->getElementOffset(ElemIdx); if (ElementByteOffset % 4 != 0) { @@ -828,49 +831,51 @@ struct SpecializeDriverShadersPassImpl { } #ifndef NDEBUG - // Sort JumpInfos by instruction order in the containing function. + // Sort JumpsAndAwaits by instruction order in the containing function. // This ensures processing order (and thereby debug output order) matches input IR order for lit tests. 
- void sortByInstructionOrder(SmallVectorImpl &JumpInfos) const { - if (JumpInfos.empty()) + void sortByInstructionOrder(SmallVectorImpl &JumpsAndAwaits) const { + if (JumpsAndAwaits.empty()) return; - Function *F = JumpInfos[0].Outgoing->getFunction(); + Function *F = JumpsAndAwaits[0]->getFunction(); - // Maps instructions to entry indices in JumpInfos - SmallDenseMap JumpToIndex; - for (const auto &[Index, JumpInfo] : enumerate(JumpInfos)) { - assert(JumpInfo.Outgoing->getFunction() == F); - [[maybe_unused]] auto Inserted = JumpToIndex.insert({JumpInfo.Outgoing, Index}).second; + // Maps instructions to entry indices in JumpsAndAwaits + SmallDenseMap JumpOrAwaitToIndex; + for (const auto &[Index, JumpOrAwait] : enumerate(JumpsAndAwaits)) { + assert(JumpOrAwait->getFunction() == F); + [[maybe_unused]] auto Inserted = JumpOrAwaitToIndex.insert({JumpOrAwait, Index}).second; assert(Inserted); } - SmallVector Result; - Result.reserve(JumpInfos.size()); + SmallVector Result; + Result.reserve(JumpsAndAwaits.size()); for (const auto &BB : *F) { for (const auto &Inst : BB) { - auto It = JumpToIndex.find(&Inst); - if (It != JumpToIndex.end()) { - Result.push_back(JumpInfos[It->second]); - JumpToIndex.erase(It); + auto It = JumpOrAwaitToIndex.find(&Inst); + if (It != JumpOrAwaitToIndex.end()) { + Result.push_back(JumpsAndAwaits[It->second]); + JumpOrAwaitToIndex.erase(It); } } } - assert(Result.size() == JumpInfos.size()); + assert(Result.size() == JumpsAndAwaits.size()); - JumpInfos = std::move(Result); + JumpsAndAwaits = std::move(Result); } #endif - // Collect and return the set of outgoing jumps/awaits that may be during Traversal. - SmallVector getRelevantOutgoingJumpsAndAwaits(const FunctionData &Data) const { - SmallVector JumpsAndAwaits; + // Collect and return the set of outgoing jumps/awaits that may be relevant during Traversal. 
+ SmallVector getRelevantOutgoingJumpsAndAwaits(const FunctionData &Data) const { + SmallVector JumpsAndAwaits; JumpsAndAwaits.reserve(Data.Jumps.size() + Data.Awaits.size()); - for (const auto &AwaitInfo : Data.Awaits) - JumpsAndAwaits.push_back(AwaitInfo); + for (const auto &AwaitOp : Data.Awaits) + JumpsAndAwaits.push_back(AwaitOp); // Ignore jumps in shaders outside of Traversal: // These are shader returns, and thus are neither during Traversal, nor entering Traversal. - if (Data.IsDuringTraversal) - JumpsAndAwaits.append(Data.Jumps); + if (Data.IsDuringTraversal) { + for (const auto &JumpOp : Data.Jumps) + JumpsAndAwaits.push_back(JumpOp); + } #ifndef NDEBUG if (M.getNamedMetadata("lgc.rt.specialize.driver.shaders.process.in.instruction.order")) @@ -884,12 +889,12 @@ struct SpecializeDriverShadersPassImpl { // We know that we are going to query the ValueOriginTracker about all arguments passed to all of these // jumps and awaits. The value origin analysis is more efficient when done in bulk, so do that here. // The later queries will then return cached results. - void runValueTrackingAnalysisOnAllOutgoingArgs(ValueOriginTracker &VOT, ArrayRef JumpsAndAwaits) { + void runValueTrackingAnalysisOnAllOutgoingArgs(ValueOriginTracker &VOT, ArrayRef JumpsAndAwaits) { SmallVector OutgoingArgs; for (const auto &JumpOrAwait : JumpsAndAwaits) { - for (unsigned OutgoingArgIdx = JumpOrAwait.FirstRelevantOutgoingArgIdx; - OutgoingArgIdx < JumpOrAwait.Outgoing->arg_size(); ++OutgoingArgIdx) { - Value *OutgoingArg = JumpOrAwait.Outgoing->getArgOperand(OutgoingArgIdx); + const unsigned StartIdx = FirstRelevantArgIdx::getForJumpOrAwait(JumpOrAwait); + for (unsigned OutgoingArgIdx = StartIdx; OutgoingArgIdx < JumpOrAwait->arg_size(); ++OutgoingArgIdx) { + Value *OutgoingArg = JumpOrAwait->getArgOperand(OutgoingArgIdx); // This might add duplicates, but that's fine. 
OutgoingArgs.push_back(OutgoingArg); } @@ -940,7 +945,8 @@ struct SpecializeDriverShadersPassImpl { // The summary of preserved/constant outgoing argument infos for this function ArgSlotsInfo FuncArgsInfo; - for (auto [JumpOrAwait, FirstRelevantArgIdx] : JumpsAndAwaits) { + for (auto *JumpOrAwait : JumpsAndAwaits) { + const unsigned FirstRelevantArgIdx = FirstRelevantArgIdx::getForJumpOrAwait(JumpOrAwait); // The different jump or continue intrinsics have a different amount of "system" arguments that are not // actually passed as argument to the jumped-to function, e.g. the function itself, or possibly a wait mask. // These system arguments come before the actual arguments, and need to be ignored for the argument @@ -1068,7 +1074,7 @@ struct SpecializeDriverShadersPassImpl { unsigned AccumArgSlotIdx = 0; ValueSpecializer VS{*Func->getParent()}; - for (unsigned ArgIdx = FirstRelevantIncomingArgIdx; ArgIdx < Func->arg_size(); ++ArgIdx) { + for (unsigned ArgIdx = FirstRelevantArgIdx::Incoming; ArgIdx < Func->arg_size(); ++ArgIdx) { Argument *Arg = Func->getArg(ArgIdx); const auto &ArgumentLayoutInfo = getOrComputeArgumentLayoutInfo(Arg->getType()); auto Result = specializeArgument(SpecializationInfo, VS, Arg, ArgumentLayoutInfo, AccumArgSlotIdx); @@ -1155,6 +1161,8 @@ struct SpecializeDriverShadersState::Impl { TraversalArgsInfo = ArgSlotsInfo::combine(TraversalArgsInfo, Other.TraversalArgsInfo); } + void print(llvm::raw_ostream &OS) const { TraversalArgsInfo.print(OS, true); } + bool operator==(const Impl &Other) const { return TraversalArgsInfo == Other.TraversalArgsInfo; } }; @@ -1268,6 +1276,10 @@ void SpecializeDriverShadersState::merge(SpecializeDriverShadersState const &Oth Pimpl->merge(*Other.Pimpl); } +void SpecializeDriverShadersState::print(llvm::raw_ostream &OS) const { + Pimpl->print(OS); +} + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// SpecializeDriverShadersPass 
llvm::PreservedAnalyses SpecializeDriverShadersPass::run(llvm::Module &Module, diff --git a/llvmraytracing/plugin/CMakeLists.txt b/llvmraytracing/plugin/CMakeLists.txt index 75b0d50abd..ea7c12f660 100644 --- a/llvmraytracing/plugin/CMakeLists.txt +++ b/llvmraytracing/plugin/CMakeLists.txt @@ -1,3 +1,28 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. 
+ # + ####################################################################################################################### + set(LLVM_RAYTRACINGPLUGIN_LINK_INTO_TOOLS ON CACHE BOOL "Link raytracing plugin into tools" FORCE) add_llvm_pass_plugin(RaytracingPlugin diff --git a/llvmraytracing/test/CMakeLists.txt b/llvmraytracing/test/CMakeLists.txt index 4d9bf9ac8d..d0a3231c45 100644 --- a/llvmraytracing/test/CMakeLists.txt +++ b/llvmraytracing/test/CMakeLists.txt @@ -1,3 +1,28 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. 
+ # + ####################################################################################################################### + set(LLVMRAYTRACING_TEST_DEPENDS opt FileCheck count not) add_custom_target(llvmraytracing-test-depends DEPENDS ${LLVMRAYTRACING_TEST_DEPENDS}) set_target_properties(llvmraytracing-test-depends PROPERTIES FOLDER "Tests") diff --git a/llvmraytracing/test/dx/cleanup-continuations-malloc.ll b/llvmraytracing/test/dx/cleanup-continuations-malloc.ll deleted file mode 100644 index b193b2a50e..0000000000 --- a/llvmraytracing/test/dx/cleanup-continuations-malloc.ll +++ /dev/null @@ -1,74 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -declare void @lgc.cps.await__void(...) -declare ptr @async_fun(i64, i32) -declare void @lgc.cps.jump(...) 
-declare void @lgc.cps.complete() - -define <4 x i32> @simple_await(i32 %dummyRet, <4 x i32> %arg) !continuation.registercount !1 { -; CHECK-LABEL: define void @simple_await( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRET:%.*]], <4 x i32> [[ARG:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { -; CHECK-NEXT: AllocaSpillBB: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 20 -; CHECK-NEXT: store i32 [[TMP8]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; CHECK-NEXT: store <4 x i32> [[ARG]], ptr addrspace(21) [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP7]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0 -; CHECK-NEXT: store i32 [[DUMMYRET]], ptr addrspace(21) [[TMP6]], align 4 -; CHECK-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @simple_await.resume.0) -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP9]], i32 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; CHECK-NEXT: unreachable -; - %callee = ptrtoint ptr @async_fun to i32 - call void (...) @lgc.cps.await__void(i32 %callee, i32 3), !continuation.registercount !1, !continuation.returnedRegistercount !1 - call void (...) 
@lgc.cps.jump(i32 %dummyRet, i32 -1, i32 poison, i32 poison, <4 x i32> %arg), !continuation.registercount !1 - unreachable -} - -define void @simple_await_entry(i32 %dummyRet, <4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !continuation.entry !0 !continuation.registercount !1 { -; CHECK-LABEL: define void @simple_await_entry( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRET:%.*]], <4 x i32> [[ARG:%.*]], ptr addrspace(1) [[MEM:%.*]]) !continuation.registercount [[META1]] !continuation.entry [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.stacksize [[META6:![0-9]+]] !continuation.state [[META6]] { -; CHECK-NEXT: AllocaSpillBB: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 24 -; CHECK-NEXT: store i32 [[TMP8]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP7]], 16 -; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP3]], i32 0 -; CHECK-NEXT: store ptr addrspace(1) [[MEM]], ptr addrspace(21) [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0 -; CHECK-NEXT: store <4 x i32> [[ARG]], ptr addrspace(21) [[TMP6]], align 4 -; CHECK-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @simple_await_entry.resume.0) -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP9]], i32 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; CHECK-NEXT: unreachable -; - %callee = ptrtoint ptr @async_fun to i32 - call void (...) @lgc.cps.await__void(i32 %callee, i32 3), !continuation.registercount !1, !continuation.returnedRegistercount !1 - store <4 x i32> %arg, <4 x i32> addrspace(1)* %mem - call void @lgc.cps.complete(), !continuation.registercount !1 - unreachable -} - -!continuation.stackAddrspace = !{!2} - -!0 = !{} -!1 = !{i32 0} -!2 = !{i32 21} diff --git a/llvmraytracing/test/dx/cleanup-continuations.ll b/llvmraytracing/test/dx/cleanup-continuations.ll deleted file mode 100644 index 40e70f0ee6..0000000000 --- a/llvmraytracing/test/dx/cleanup-continuations.ll +++ /dev/null @@ -1,294 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3 -; RUN: opt --verify-each -passes='cleanup-continuations,lint,continuations-lint' -S %s --lint-abort-on-error | FileCheck %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%continuation.token = type { } -%await_with_ret_value.Frame = type { i32 } -%simple_await.Frame = type { i32 } -%simple_await_entry.Frame = type { } -%phi_of_cont_state.Frame = type { i32, i32 } - -declare %continuation.token* @async_fun() -declare { i32 } @lgc.ilcps.getReturnValue__i32() #0 -declare void @lgc.cps.complete() -declare void @lgc.cps.jump(...) 
- -define { i8*, %continuation.token* } @simple_await(i32 %dummyRet, i8* %0) !continuation !0 !continuation.registercount !4 { -; CHECK-LABEL: define void @simple_await( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRET:%.*]]) !continuation [[META1:![0-9]+]] !continuation.registercount [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { -; CHECK-NEXT: AllocaSpillBB: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP4]], 8 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; CHECK-NEXT: store i32 -1, ptr addrspace(21) [[TMP3]], align 4 -; CHECK-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @simple_await.resume.0) -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP5]], i32 [[TMP0]], i64 2), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] -; CHECK-NEXT: unreachable -; -AllocaSpillBB: - %FramePtr = bitcast i8* %0 to %simple_await.Frame* - %.spill.addr = getelementptr inbounds %simple_await.Frame, %simple_await.Frame* %FramePtr, i32 0, i32 0 - store i32 -1, i32* %.spill.addr, align 4 - %callee = ptrtoint ptr @async_fun to i32 - %tok = call %continuation.token* @async_fun(i32 %callee, i64 1, i64 2), !continuation.registercount !4, !continuation.returnedRegistercount !4 - %1 = insertvalue { i8*, %continuation.token* } { i8* bitcast ({ i8*, %continuation.token* } (i8*, i1)* @simple_await.resume.0 to i8*), %continuation.token* undef }, %continuation.token* %tok, 1 - ret { i8*, %continuation.token* } %1 -} - -define internal { i8*, %continuation.token* } @simple_await.resume.0(i8* noalias nonnull align 16 dereferenceable(8) %0, i1 %1) !continuation !0 { -; CHECK-LABEL: define dso_local void @simple_await.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]]) !continuation [[META1]] !continuation.registercount [[META2]] { -; CHECK-NEXT: entryresume.0: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -8 -; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP3]], i32 0 -; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], -8 -; CHECK-NEXT: store i32 [[TMP6]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTRELOAD]], i32 -1, i32 [[TMP7]], i32 poison), !continuation.registercount [[META2]] -; CHECK-NEXT: unreachable -; -entryresume.0: - %FramePtr = bitcast i8* %0 to %simple_await.Frame* - %vFrame = bitcast %simple_await.Frame* %FramePtr to i8* - %.reload.addr = getelementptr inbounds %simple_await.Frame, %simple_await.Frame* %FramePtr, i32 0, i32 0 - %.reload = load i32, i32* %.reload.addr, align 4 - call void (...) @lgc.cps.jump(i32 %.reload, i32 -1, i32 poison, i32 poison), !continuation.registercount !4 - unreachable -} - -define { i8*, %continuation.token* } @simple_await_entry(i32 %dummyRet, i8* %0) !continuation.entry !2 !continuation !3 !continuation.registercount !4 { -; CHECK-LABEL: define void @simple_await_entry( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRET:%.*]]) !continuation [[META4:![0-9]+]] !continuation.registercount [[META2]] !continuation.entry [[META5:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { -; CHECK-NEXT: AllocaSpillBB: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP2]], 8 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @simple_await_entry.resume.0) -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP3]], i32 [[TMP0]], i64 2), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] -; CHECK-NEXT: unreachable -; -AllocaSpillBB: - %FramePtr = bitcast i8* %0 to %simple_await_entry.Frame* - %callee = ptrtoint ptr @async_fun to i32 - %tok = call %continuation.token* @async_fun(i32 %callee, i64 1, i64 2), !continuation.registercount !4, !continuation.returnedRegistercount !4 - %1 = bitcast { i8*, %continuation.token* } (i8*, i1)* @simple_await_entry.resume.0 to i8* - %2 = insertvalue { i8*, %continuation.token* } undef, i8* %1, 0 - %3 = insertvalue { i8*, %continuation.token* } %2, %continuation.token* %tok, 1 - ret { i8*, %continuation.token* } %3 -} - -define internal { i8*, %continuation.token* } @simple_await_entry.resume.0(i8* noalias nonnull align 16 dereferenceable(8) %0, i1 %1) !continuation.entry !2 !continuation !3 { -; CHECK-LABEL: define dso_local void @simple_await_entry.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]]) !continuation [[META4]] !continuation.registercount [[META2]] { -; CHECK-NEXT: entryresume.0: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -8 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[CSP]], align 4 -; CHECK-NEXT: ret void -; -entryresume.0: - %FramePtr = bitcast i8* %0 to %simple_await_entry.Frame* - %vFrame = bitcast %simple_await_entry.Frame* %FramePtr to i8* - call void @lgc.cps.complete(), !continuation.registercount !4 - unreachable -} - -define { i8*, %continuation.token* } @await_with_ret_value(i32 %dummyRet, i8* %0) !continuation !1 !continuation.registercount !4 { -; CHECK-LABEL: define void @await_with_ret_value( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 
[[DUMMYRET:%.*]]) !continuation [[META6:![0-9]+]] !continuation.registercount [[META2]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP5]], 8 -; CHECK-NEXT: store i32 [[TMP2]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP3]], i32 0 -; CHECK-NEXT: store i64 -1, ptr addrspace(21) [[TMP4]], align 4 -; CHECK-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @await_with_ret_value.resume.0) -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP6]], i32 [[TMP1]], i64 2), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] -; CHECK-NEXT: unreachable -; - %FramePtr = bitcast i8* %0 to %await_with_ret_value.Frame* - %.spill.addr = getelementptr inbounds %await_with_ret_value.Frame, %await_with_ret_value.Frame* %FramePtr, i32 0, i32 0 - store i64 -1, i64* %.spill.addr, align 4 - %callee = ptrtoint ptr @async_fun to i32 - %tok = call %continuation.token* @async_fun(i32 %callee, i64 1, i64 2), !continuation.registercount !4, !continuation.returnedRegistercount !4 - %res = insertvalue { i8*, %continuation.token* } { i8* bitcast ({ i8*, %continuation.token* } (i8*, i1)* @await_with_ret_value.resume.0 to i8*), %continuation.token* undef }, %continuation.token* %tok, 1 - ret { i8*, %continuation.token* } %res -} - -define internal { i8*, %continuation.token* } @await_with_ret_value.resume.0(i8* noalias nonnull align 16 dereferenceable(8) %0, i1 %1) !continuation !1 { -; CHECK-LABEL: define dso_local void 
@await_with_ret_value.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) !continuation [[META6]] !continuation.registercount [[META2]] { -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -8 -; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { i32 } poison, i32 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 -; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP5]], align 4 -; CHECK-NEXT: [[RES_2:%.*]] = extractvalue { i32 } [[TMP9]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -8 -; CHECK-NEXT: store i32 [[TMP7]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTRELOAD]], i32 -1, i32 [[TMP8]], i32 poison, i32 [[RES_2]]), !continuation.registercount [[META2]] -; CHECK-NEXT: unreachable -; - %FramePtr = bitcast i8* %0 to %await_with_ret_value.Frame* - %vFrame = bitcast %await_with_ret_value.Frame* %FramePtr to i8* - %.reload.addr = getelementptr inbounds %await_with_ret_value.Frame, %await_with_ret_value.Frame* %FramePtr, i32 0, i32 0 - %.reload = load i32, i32* %.reload.addr, align 4 - %res = call { i32 } @lgc.ilcps.getReturnValue__i32() - %res.2 = extractvalue { i32 } %res, 0 - call void (...) 
@lgc.cps.jump(i32 %.reload, i32 -1, i32 poison, i32 poison, i32 %res.2), !continuation.registercount !4 - unreachable -} - -; unreachables in their own block added by switch case statements should be ignored -define { i8*, %continuation.token* } @switch_case_unreachable(i32 %dummyRet, i8* %0) !continuation !6 !continuation.registercount !4 { -; CHECK-LABEL: define void @switch_case_unreachable( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRET:%.*]]) !continuation [[META7:![0-9]+]] !continuation.registercount [[META2]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 8 -; CHECK-NEXT: store i32 [[TMP2]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP1]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP3]], i32 0 -; CHECK-NEXT: store i64 -1, ptr addrspace(21) [[TMP4]], align 4 -; CHECK-NEXT: [[VAL:%.*]] = urem i32 [[DUMMYRET]], 2 -; CHECK-NEXT: switch i32 [[VAL]], label [[UNREACHABLE:%.*]] [ -; CHECK-NEXT: i32 0, label [[A:%.*]] -; CHECK-NEXT: i32 1, label [[B:%.*]] -; CHECK-NEXT: ] -; CHECK: unreachable: -; CHECK-NEXT: unreachable -; CHECK: b: -; CHECK-NEXT: br label [[A]] -; CHECK: a: -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], -8 -; CHECK-NEXT: store i32 [[TMP6]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DUMMYRET]], i32 -1, i32 [[TMP7]], i32 poison), !continuation.registercount [[META2]] -; CHECK-NEXT: unreachable -; - %FramePtr = bitcast i8* %0 to %await_with_ret_value.Frame* - %.spill.addr = getelementptr inbounds %await_with_ret_value.Frame, %await_with_ret_value.Frame* %FramePtr, i32 0, i32 0 - store i64 -1, i64* %.spill.addr, align 4 - %val = urem i32 %dummyRet, 2 - switch i32 %val, label %unreachable [ - i32 0, label %a - i32 1, label %b - ] - -unreachable: - unreachable - -b: - br label %a - -a: - call void (...) @lgc.cps.jump(i32 %dummyRet, i32 -1, i32 poison, i32 poison), !continuation.registercount !4 - unreachable -} - -; Check that phis on the continuation state compile -define { i8*, %continuation.token* } @phi_of_cont_state(i32 %dummyRet, ptr %FramePtr) !continuation !7 !continuation.registercount !4 { -; CHECK-LABEL: define void @phi_of_cont_state( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRET:%.*]]) !continuation [[META8:![0-9]+]] !continuation.registercount [[META2]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 8 -; CHECK-NEXT: store i32 [[TMP2]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[COND:%.*]] = trunc i32 [[DUMMYRET]] to i1 -; CHECK-NEXT: br i1 [[COND]], label [[LA:%.*]], label [[LB:%.*]] -; CHECK: la: -; CHECK-NEXT: br label [[END:%.*]] -; CHECK: lb: -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], 4 -; CHECK-NEXT: br label [[END]] -; CHECK: end: -; CHECK-NEXT: [[C_0:%.*]] = phi i32 [ [[TMP1]], [[LA]] ], [ [[TMP3]], [[LB]] ] -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[C_0]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 -; CHECK-NEXT: store i32 -1, ptr addrspace(21) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr 
[[CSP]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -8 -; CHECK-NEXT: store i32 [[TMP7]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[DUMMYRET]], i32 -1, i32 [[TMP8]], i32 poison), !continuation.registercount [[META2]] -; CHECK-NEXT: unreachable -; - %cond = trunc i32 %dummyRet to i1 - br i1 %cond, label %la, label %lb - -la: - %a = getelementptr inbounds %phi_of_cont_state.Frame, ptr %FramePtr, i32 0, i32 0 - br label %end - -lb: - %b = getelementptr inbounds %phi_of_cont_state.Frame, ptr %FramePtr, i32 0, i32 1 - br label %end - -end: - %c = phi ptr [ %a, %la ], [ %b, %lb ] - store i32 -1, ptr %c, align 4 - call void (...) @lgc.cps.jump(i32 %dummyRet, i32 -1, i32 poison, i32 poison), !continuation.registercount !4 - unreachable -} - -attributes #0 = { nounwind } - -!continuation.stackAddrspace = !{!5} - -!0 = !{{ i8*, %continuation.token* } (i8*)* @simple_await} -!1 = !{{ i8*, %continuation.token* } (i8*)* @await_with_ret_value} -!2 = !{} -!3 = !{{ i8*, %continuation.token* } (i8*)* @simple_await_entry} -!4 = !{i32 0} -!5 = !{i32 21} -!6 = !{{ i8*, %continuation.token* } (i8*)* @switch_case_unreachable} -!7 = !{{ i8*, %continuation.token* } (i8*)* @phi_of_cont_state} -;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(inaccessiblemem: readwrite) } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind willreturn } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind willreturn memory(inaccessiblemem: read) } -;. -; CHECK: [[META0:![0-9]+]] = !{i32 21} -; CHECK: [[META1]] = !{ptr @simple_await} -; CHECK: [[META2]] = !{i32 0} -; CHECK: [[META3]] = !{i32 8} -; CHECK: [[META4]] = !{ptr @simple_await_entry} -; CHECK: [[META5]] = !{} -; CHECK: [[META6]] = !{ptr @await_with_ret_value} -; CHECK: [[META7]] = !{ptr @switch_case_unreachable} -; CHECK: [[META8]] = !{ptr @phi_of_cont_state} -;. 
diff --git a/llvmraytracing/test/dx/closest-hit.ll b/llvmraytracing/test/dx/closest-hit.ll deleted file mode 100644 index f6fe484f10..0000000000 --- a/llvmraytracing/test/dx/closest-hit.ll +++ /dev/null @@ -1,200 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.DispatchSystemData = type { <3 x i32> } -%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float } -%struct.HitData = type { float, i32 } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.RayPayload = type { <2 x float> } - -declare i64 @_cont_GetTraversalAddr() #0 - -declare i32 @_cont_GetContinuationStackAddr() #0 - -declare !pointeetys !9 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 - -declare !pointeetys !11 void @_cont_SetTriangleHitAttributes(%struct.SystemData*, %struct.BuiltInTriangleIntersectionAttributes) #0 - -declare !pointeetys !12 i1 @_cont_IsEndSearch(%struct.TraversalData*) #0 - -declare %struct.DispatchSystemData @_cont_Traversal(%struct.TraversalData) #0 - -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i32, %struct.AnyHitTraversalData, float, i32) #0 - -declare !pointeetys !14 %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData*) 
#0 - -declare !pointeetys !16 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0 - -declare !pointeetys !17 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #0 !pointeetys !17 { -; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex( -; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: ret i32 5 -; - ret i32 5 -} - -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !19 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %newdata = call %struct.DispatchSystemData @_cont_Traversal(%struct.TraversalData %trav_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - ret void -} - -define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 !pointeetys !20 { - %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i32 3, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) - store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 - ret i1 true -} - -; Function Attrs: nounwind memory(none) -declare !pointeetys !21 i32 @_cont_DispatchRaysIndex(%struct.DispatchSystemData* nocapture readnone, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !21 i32 @_cont_DispatchRaysDimensions(%struct.DispatchSystemData* nocapture readnone, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !22 
float @_cont_WorldRayOrigin(%struct.DispatchSystemData* nocapture readnone, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !22 float @_cont_WorldRayDirection(%struct.DispatchSystemData* nocapture readnone, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !23 float @_cont_RayTMin(%struct.DispatchSystemData* nocapture readnone) #1 - -; Function Attrs: nounwind memory(read) -declare !pointeetys !24 float @_cont_RayTCurrent(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*) #2 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !17 i32 @_cont_RayFlags(%struct.DispatchSystemData* nocapture readnone) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !26 i32 @_cont_InstanceIndex(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !26 i32 @_cont_InstanceID(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !26 i32 @_cont_PrimitiveIndex(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !27 float @_cont_ObjectRayOrigin(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !27 float @_cont_ObjectRayDirection(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !28 float @_cont_ObjectToWorld(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*, i32, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !28 float @_cont_WorldToObject(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*, i32, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !29 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, 
%struct.HitData*) #1 - -; Function Attrs: nounwind -define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !pointeetys !30 { -; LOWERRAYTRACINGPIPELINE-LABEL: define void @ClosestHit( -; LOWERRAYTRACINGPIPELINE-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META13:![0-9]+]] !continuation [[META14:![0-9]+]] !continuation.registercount [[META10:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [8 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [8 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = 
load i32, ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP12]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[HITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[HITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: [[PTR:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[BARYPTR:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[HITATTRS]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[BARY:%.*]] = load <2 x float>, ptr [[BARYPTR]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store <2 x float> [[BARY]], ptr [[PTR]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr 
[[TMP19]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [19 x i32] poison, [8 x i32] [[TMP24]]), !continuation.registercount [[META10]] -; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; - %ptr = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - %baryPtr = getelementptr inbounds %struct.BuiltInTriangleIntersectionAttributes, %struct.BuiltInTriangleIntersectionAttributes* %attr, i32 0, i32 0 - %bary = load <2 x float>, <2 x float>* %baryPtr, align 4 - store <2 x float> %bary, <2 x float>* %ptr, align 4 - ret void -} - -attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind memory(none) } -attributes #2 = { nounwind memory(read) } -attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{} -!dx.entryPoints = !{!4, !6} - -!0 = 
!{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{null, null, null, null} -!4 = !{null, !"", null, !3, !5} -!5 = !{i32 0, i64 65536} -!6 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @ClosestHit, !"ClosestHit", null, null, !7} -!7 = !{i32 8, i32 10, i32 5, !8} -!8 = !{i32 0} -!9 = !{%struct.SystemData poison} -!10 = !{i32 0, %struct.SystemData poison} -!11 = !{%struct.SystemData poison} -!12 = !{%struct.TraversalData poison} -!13 = !{i32 0, %struct.TraversalData poison} -!14 = !{%struct.AnyHitTraversalData poison} -!15 = !{i32 0, %struct.AnyHitTraversalData poison} -!16 = !{%struct.SystemData poison} -!17 = !{%struct.DispatchSystemData poison} -!18 = !{i32 0, %struct.DispatchSystemData poison} -!19 = !{%struct.DispatchSystemData poison} -!20 = !{%struct.AnyHitTraversalData poison} -!21 = !{%struct.DispatchSystemData poison} -!22 = !{%struct.DispatchSystemData poison} -!23 = !{%struct.DispatchSystemData poison} -!24 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!25 = !{i32 0, %struct.HitData poison} -!26 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!27 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!28 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!29 = !{null, %struct.SystemData poison, %struct.HitData poison} -!30 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} -!31 = !{i32 0, %struct.RayPayload poison} -!32 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} diff --git a/llvmraytracing/test/dx/continuation-registercount.ll b/llvmraytracing/test/dx/continuation-registercount.ll deleted file mode 100644 index 8bb51b44d8..0000000000 --- a/llvmraytracing/test/dx/continuation-registercount.ll +++ /dev/null @@ -1,305 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: grep -v MAX_REG_10 %s | \ -; RUN: opt --verify-each 
--report-payload-register-sizes=byjump -passes='dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint,continuations-stats-report,remove-types-metadata' -S --lint-abort-on-error 2>&1 | \ -; RUN: FileCheck -check-prefixes=COMMON,MAX30 %s -; -; RUN: grep -v MAX_REG_30 %s | \ -; RUN: opt --verify-each --report-payload-register-sizes=byjump -passes='dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint,continuations-stats-report,remove-types-metadata' -S --lint-abort-on-error 2>&1 | \ -; RUN: FileCheck -check-prefixes=COMMON,MAX10 %s - -; The order of metadata on functions is non-deterministic, so make two different runs to match both of them. -; The 'grep' commands filter out a metadata node that reduces the payload register count. 
- -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.DispatchSystemData = type { i32 } -%struct.TraversalData = type { %struct.SystemData } -%struct.SystemData = type { %struct.DispatchSystemData, %struct.BuiltInTriangleIntersectionAttributes } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.HitData = type { float, i32 } -%struct.TheirParams = type { [10 x i32] } -%struct.RayPayload = type { [15 x i32] } -%struct.PayloadWithI16 = type { i16, i16 } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.MyParams = type { [26 x i32] } -%struct.TheirParams2 = type { [27 x i32] } -%struct._AmdTraversalResultData = type { %struct._AmdPrimitiveSystemState, <2 x float>, i32 } -%struct._AmdPrimitiveSystemState = type { float, i32, i32, i32 } -%struct._AmdSystemData = type { %struct._AmdTraversalResultData } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -; Function Attrs: alwaysinline -declare i32 @_cont_GetContinuationStackAddr() #0 - -; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i32, %struct.TraversalData) #0 - -; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitShader(i32, i32, %struct.DispatchSystemData) #0 - -; Function Attrs: alwaysinline -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i32, i32, %struct.AnyHitTraversalData) #0 - -; Function Attrs: nounwind memory(read) -declare !pointeetys !24 i32 @_cont_HitKind(%struct.SystemData* 
nocapture readnone, %struct.HitData*) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !27 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #2 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !29 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #2 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !29 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture readnone) #2 - -; Function Attrs: alwaysinline -declare i1 @opaqueIsEnd() #0 - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { - ret void -} - -; Function Attrs: alwaysinline -define i1 @_cont_IsEndSearch(%struct.TraversalData* %data) #0 !pointeetys !31 { - %isEnd = call i1 @opaqueIsEnd() - ret i1 %isEnd -} - -; Function Attrs: alwaysinline -define %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData* %data) #0 !pointeetys !33 { - %addr = getelementptr %struct.SystemData, %struct.SystemData* %data, i32 0, i32 1 - %val = load %struct.BuiltInTriangleIntersectionAttributes, %struct.BuiltInTriangleIntersectionAttributes* %addr, align 4 - ret %struct.BuiltInTriangleIntersectionAttributes %val -} - -; Function Attrs: alwaysinline -define void @_cont_SetTriangleHitAttributes(%struct.SystemData* %data, %struct.BuiltInTriangleIntersectionAttributes %val) #0 !pointeetys !34 { - %addr = getelementptr %struct.SystemData, %struct.SystemData* %data, i32 0, i32 1 - store %struct.BuiltInTriangleIntersectionAttributes %val, %struct.BuiltInTriangleIntersectionAttributes* %addr, align 4 - ret void -} - -declare !pointeetys !35 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -; Function Attrs: alwaysinline -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #0 !pointeetys !35 { - ret i32 5 -} - -; Function Attrs: alwaysinline -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 
%0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !36 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i32 4, %struct.TraversalData %trav_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -; Function Attrs: alwaysinline -define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !pointeetys !37 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i32 2, i32 poison, %struct.DispatchSystemData %dis_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -; Function Attrs: alwaysinline -define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 !pointeetys !38 { - %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i32 3, i32 poison, %struct.AnyHitTraversalData %trav_data) - store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 - call void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* %data) - ret i1 true -} - -; COMMON-DAG: Incoming payload VGPR size of "main" (raygeneration): 0 dwords -; COMMON-DAG: Outgoing payload VGPR size by jump: -; COMMON-DAG: call void (...) 
@lgc.cps.jump(i32 2, {{.*}} %struct.DispatchSystemData %{{.*}}: 10 dwords - -define void @main() { - %params = alloca %struct.TheirParams, align 4 - call void @dx.op.callShader.struct.TheirParams(i32 159, i32 1, %struct.TheirParams* nonnull %params) - ret void -} - -; COMMON-DAG: Incoming payload VGPR size of "mainTrace" (raygeneration): 0 dwords -; COMMON-DAG: Outgoing payload VGPR size by jump: -; MAX10-DAG: call void (...) @lgc.cps.jump(i32 4, {{.*}} %struct.TraversalData %{{.*}}: 10 dwords -; MAX30-DAG: call void (...) @lgc.cps.jump(i32 4, {{.*}} %struct.TraversalData %{{.*}}: 15 dwords -define void @mainTrace() { - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.RayPayload, align 4 - %4 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 - %5 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) - %6 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %5, %dx.types.ResourceProperties { i32 16, i32 0 }) - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %6, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) - ret void -} - -; If we set maxPayloadRegisterCount to 10, both functions use only 10 payload registers. 
-; MAX10-DAG: Incoming payload VGPR size of "called" (callable): 10 dwords -; MAX10-DAG: Incoming payload VGPR size of "called.resume.0" (callable): 10 dwords -; MAX30-DAG: Incoming payload VGPR size of "called" (callable): 26 dwords -; MAX30-DAG: Incoming payload VGPR size of "called.resume.0" (callable): 27 dwords - -define void @called(%struct.MyParams* %arg) !pointeetys !39 { - %params = alloca %struct.TheirParams2, align 4 - call void @dx.op.callShader.struct.TheirParams2(i32 159, i32 2, %struct.TheirParams2* nonnull %params) - ret void -} - -; MAX10-DAG: Incoming payload VGPR size of "Intersection" (intersection): 10 dwords -; MAX30-DAG: Incoming payload VGPR size of "Intersection" (intersection): 30 dwords -; COMMON-DAG: Outgoing payload VGPR size by jump: -; MAX10-DAG: call void (...) @lgc.cps.jump(i32 3, {{.*}}: 10 dwords -; MAX30-DAG: call void (...) @lgc.cps.jump(i32 3, {{.*}}: 30 dwords - -; MAX10-DAG: Incoming payload VGPR size of "Intersection.resume.0" (intersection): 10 dwords -; MAX30-DAG: Incoming payload VGPR size of "Intersection.resume.0" (intersection): 30 dwords -; COMMON-DAG: Outgoing payload VGPR size by jump: -; MAX10-DAG: call void (...) @lgc.cps.jump(i32 %returnAddr.reload{{.*}}: 10 dwords -; MAX30-DAG: call void (...) 
@lgc.cps.jump(i32 %returnAddr.reload{{.*}}: 30 dwords - -define void @Intersection() #3 { - %a = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4 - %b = call i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32 158, float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes* nonnull %a) - ret void -} - -; MAX10-DAG: Incoming payload VGPR size of "AnyHit" (anyhit): 10 dwords -; MAX30-DAG: Incoming payload VGPR size of "AnyHit" (anyhit): 15 dwords - -define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !pointeetys !41 { - ret void -} - -; With fixed hit attribute registers and without PAQs, ClosestHitOut also contains storage for hit attributes -; MAX10-DAG: Incoming payload VGPR size of "ClosestHit" (closesthit): 10 dwords -; MAX30-DAG: Incoming payload VGPR size of "ClosestHit" (closesthit): 15 dwords - -define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.AnyHitTraversalData* nocapture readonly %attr) #3 !pointeetys !41 { - ret void -} - -; COMMON-DAG: Incoming payload VGPR size of "Miss16" (miss): 1 dwords -define void @Miss16(%struct.PayloadWithI16* noalias nocapture %payload) !pointeetys !55 { - ret void -} - -declare void @_AmdEnqueueAnyHit(i32, i32, %struct._AmdSystemData, <2 x float>) #0 - -; MAX10-DAG: Incoming payload VGPR size of "_cont_Traversal" (compute): 10 dwords -; COMMON-DAG: Outgoing payload VGPR size by jump: -; MAX10-DAG: call {{.*}} @lgc.cps.jump({{.*}}: 10 dwords -; MAX30-DAG: Incoming payload VGPR size of "_cont_Traversal" (compute): 27 dwords -; COMMON-DAG: Outgoing payload VGPR size by jump: -; MAX30-DAG: call {{.*}} @lgc.cps.jump({{.*}}: 27 dwords - -define void @_cont_Traversal(%struct._AmdTraversalResultData* noalias nocapture sret(%struct._AmdTraversalResultData) %agg.result, %struct._AmdSystemData* noalias %data) !pointeetys !44 { - call void @_AmdEnqueueAnyHit(i32 0, i32 
poison, %struct.BuiltInTriangleIntersectionAttributes undef, <2 x float> undef) - unreachable -} - -; Function Attrs: nounwind -declare !pointeetys !47 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #3 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #2 - -; Function Attrs: nounwind memory(read) -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #1 - -; Function Attrs: nounwind -declare !pointeetys !48 void @dx.op.callShader.struct.TheirParams(i32, i32, %struct.TheirParams*) #3 - -; Function Attrs: nounwind -declare !pointeetys !50 void @dx.op.callShader.struct.TheirParams2(i32, i32, %struct.TheirParams2*) #3 - -declare !pointeetys !52 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) - -attributes #0 = { alwaysinline } -attributes #1 = { nounwind memory(read) } -attributes #2 = { nounwind memory(none) } -attributes #3 = { nounwind } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.entryPoints = !{!3, !6, !13, !15, !17, !19, !21, !57} -!continuation.maxPayloadRegisterCount = !{!23} ; 10; only for MAX_REG_10 -!continuation.maxPayloadRegisterCount = !{!53} ; 30; only for MAX_REG_30 -!continuation.maxUsedPayloadRegisterCount = !{!23} ; 10; only for MAX_REG_10 -!continuation.maxUsedPayloadRegisterCount = !{!54} ; 27; only for MAX_REG_30 -!lgc.rt.max.attribute.size = !{!60} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{null, !"", null, !4, !12} -!4 = !{!5, !9, null, null} -!5 = !{!6} -!6 = !{void ()* @main, !"main", null, null, !7} -!7 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !8} -!8 = !{i32 0} -!9 = !{!10} -!10 = 
!{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !11} -!11 = !{i32 0, i32 9} -!12 = !{i32 0, i64 65536} -!13 = !{void (%struct.MyParams*)* @called, !"called", null, null, !14} -!14 = !{i32 8, i32 12} -!15 = !{void ()* @mainTrace, !"mainTrace", null, null, !16} -!16 = !{i32 8, i32 7} -!17 = !{void ()* @Intersection, !"Intersection", null, null, !18} -!18 = !{i32 8, i32 8, i32 5, !8} -!19 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @AnyHit, !"AnyHit", null, null, !20} -!20 = !{i32 8, i32 9, i32 5, !8} -!21 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @ClosestHit, !"ClosestHit", null, null, !22} -!22 = !{i32 8, i32 10, i32 5, !8} -!23 = !{i32 10} -!24 = !{null, %struct.SystemData poison, %struct.HitData poison} -!25 = !{i32 0, %struct.SystemData poison} -!26 = !{i32 0, %struct.HitData poison} -!27 = !{%struct.DispatchSystemData poison} -!28 = !{i32 0, %struct.DispatchSystemData poison} -!29 = !{%struct.AnyHitTraversalData poison} -!30 = !{i32 0, %struct.AnyHitTraversalData poison} -!31 = !{%struct.TraversalData poison} -!32 = !{i32 0, %struct.TraversalData poison} -!33 = !{%struct.SystemData poison} -!34 = !{%struct.SystemData poison} -!35 = !{%struct.DispatchSystemData poison} -!36 = !{%struct.DispatchSystemData poison} -!37 = !{%struct.DispatchSystemData poison} -!38 = !{%struct.AnyHitTraversalData poison} -!39 = !{%struct.MyParams poison} -!40 = !{i32 0, %struct.MyParams poison} -!41 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} -!42 = !{i32 0, %struct.RayPayload poison} -!43 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} -!44 = !{null, %struct._AmdTraversalResultData poison, %struct._AmdSystemData poison} -!45 = !{i32 0, %struct._AmdTraversalResultData poison} 
-!46 = !{i32 0, %struct._AmdSystemData poison} -!47 = !{%struct.RayPayload poison} -!48 = !{%struct.TheirParams poison} -!49 = !{i32 0, %struct.TheirParams poison} -!50 = !{%struct.TheirParams2 poison} -!51 = !{i32 0, %struct.TheirParams2 poison} -!52 = !{%struct.BuiltInTriangleIntersectionAttributes poison} -!53 = !{i32 30} -!54 = !{i32 27} -!55 = !{%struct.PayloadWithI16 poison} -!56 = !{i32 0, %struct.PayloadWithI16 poison} -!57 = !{void (%struct.PayloadWithI16*)* @Miss16, !"Miss16", null, null, !58} -!58 = !{i32 8, i32 11, i32 6, i32 24, i32 5, !59} -!59 = !{i32 0} -!60 = !{i32 8} diff --git a/llvmraytracing/test/dx/continuation-stacksize.ll b/llvmraytracing/test/dx/continuation-stacksize.ll deleted file mode 100644 index fc84464f9b..0000000000 --- a/llvmraytracing/test/dx/continuation-stacksize.ll +++ /dev/null @@ -1,189 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \ -; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS-STACKSIZE %s -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint,remove-types-metadata' \ -; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP-STATESIZE %s - -; The order of metadata on functions is non-deterministic, so make two different runs to match both of them. 
- -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.DispatchSystemData = type { i32 } -%struct.TraversalData = type { %struct.SystemData } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.TheirParams = type { [64 x i32] } -%struct.RayPayload = type { [68 x i32] } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.MyParams = type { [48 x i32] } -%struct.TheirParams2 = type { [65 x i32] } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -; Function Attrs: alwaysinline -declare i32 @_cont_GetContinuationStackAddr() #0 - -declare !pointeetys !33 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) - -; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i32, %struct.TraversalData) #0 - -; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitShader(i32, i32, %struct.DispatchSystemData) #0 - -; Function Attrs: alwaysinline -declare !pointeetys !17 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !19 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 - -declare !pointeetys !21 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { - ret void -} - -; Function Attrs: alwaysinline -define i32 
@_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #0 !pointeetys !21 { - ret i32 5 -} - -; Function Attrs: alwaysinline -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !22 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i32 4, %struct.TraversalData %trav_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - ret void -} - -; Function Attrs: alwaysinline -define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !pointeetys !23 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i32 2, i32 poison, %struct.DispatchSystemData %dis_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: define void @main(%struct.DispatchSystemData %0){{.*}} !continuation.stacksize ![[main_stacksize:[0-9]+]] -; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: ![[main_stacksize]] = !{i32 140} - -; POSTPROCESS-STACKSIZE-DAG: define void @main({{.*}} !continuation.stacksize ![[main_stacksize:[0-9]+]] -; POSTPROCESS-STACKSIZE-DAG: ![[main_stacksize]] = !{i32 140} -; CLEANUP-STATESIZE-DAG: define void @main({{.*}} !continuation.state ![[main_state:[0-9]+]] -; CLEANUP-STATESIZE-DAG: ![[main_state]] = !{i32 0} - -define void @main() { - %params = alloca %struct.TheirParams, align 4 - call void 
@dx.op.callShader.struct.TheirParams(i32 159, i32 1, %struct.TheirParams* nonnull %params) - ret void -} - -; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: define void @mainTrace(%struct.DispatchSystemData %0){{.*}} !continuation.stacksize ![[maintrace_stacksize:[0-9]+]] -; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: ![[maintrace_stacksize]] = !{i32 180} - -; CLEANUP-STACKSIZE-DAG: define void @mainTrace{{.*}}%struct.DispatchSystemData{{.*}} !continuation.stacksize ![[maintrace_stacksize:[0-9]+]] -; CLEANUP-STACKSIZE-DAG: ![[maintrace_stacksize]] = !{i32 180} -; CLEANUP-STATESIZE-DAG: define void @mainTrace{{.*}}%struct.DispatchSystemData{{.*}} !continuation.state ![[main_state]] - -; SAVESTATE-STACKSIZE-DAG: define void @mainTrace(%struct.DispatchSystemData %0){{.*}} !continuation.stacksize ![[maintrace_stacksize:[0-9]+]] -; SAVESTATE-STACKSIZE-DAG: ![[maintrace_stacksize]] = !{i32 180} -; SAVESTATE-STATESIZE-DAG: define void @mainTrace(%struct.DispatchSystemData %0){{.*}} !continuation.state ![[main_state]] - -define void @mainTrace() { - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.RayPayload, align 4 - %4 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 - %5 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) - %6 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %5, %dx.types.ResourceProperties { i32 16, i32 0 }) - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %6, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) - ret void -} - -; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: define 
%struct.DispatchSystemData @called({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.stacksize ![[called_stacksize:[0-9]+]] -; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: ![[called_stacksize]] = !{i32 144} - -; CLEANUP-STACKSIZE-DAG: define void @called({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.stacksize ![[called_stacksize:[0-9]+]] -; CLEANUP-STACKSIZE-DAG: ![[called_stacksize]] = !{i32 344} -; CLEANUP-STATESIZE-DAG: define void @called{{.*}}%struct.DispatchSystemData{{.*}} !continuation.state ![[called_state:[0-9]+]] -; CLEANUP-STATESIZE-DAG: ![[called_state]] = !{i32 200} - -; SAVESTATE-STACKSIZE-DAG: define void @called({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.stacksize ![[called_stacksize:[0-9]+]] -; SAVESTATE-STACKSIZE-DAG: ![[called_stacksize]] = !{i32 348} -; SAVESTATE-STATESIZE-DAG: define void @called{{.*}}%struct.DispatchSystemData{{.*}} !continuation.state ![[called_state:[0-9]+]] -; SAVESTATE-STATESIZE-DAG: ![[called_state]] = !{i32 204} - -define void @called(%struct.MyParams* %arg) !pointeetys !24 { - %params = alloca %struct.TheirParams2, align 4 - call void @dx.op.callShader.struct.TheirParams2(i32 159, i32 2, %struct.TheirParams2* nonnull %params) - ret void -} - -; Function Attrs: nounwind -declare !pointeetys !26 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #2 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 - -; Function Attrs: nounwind memory(read) -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #3 - -; Function Attrs: nounwind -declare !pointeetys !28 void @dx.op.callShader.struct.TheirParams(i32, i32, %struct.TheirParams*) #2 - -; Function Attrs: nounwind -declare !pointeetys !30 void @dx.op.callShader.struct.TheirParams2(i32, i32, %struct.TheirParams2*) 
#2 - -attributes #0 = { alwaysinline } -attributes #1 = { nounwind memory(none) } -attributes #2 = { nounwind } -attributes #3 = { nounwind memory(read) } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.entryPoints = !{!3, !6, !13, !15} -!lgc.rt.max.attribute.size = !{!34} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{null, !"", null, !4, !12} -!4 = !{!5, !9, null, null} -!5 = !{!6} -!6 = !{void ()* @main, !"main", null, null, !7} -!7 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !8} -!8 = !{i32 0} -!9 = !{!10} -!10 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !11} -!11 = !{i32 0, i32 9} -!12 = !{i32 0, i64 65536} -!13 = !{void (%struct.MyParams*)* @called, !"called", null, null, !14} -!14 = !{i32 8, i32 12} -!15 = !{void ()* @mainTrace, !"mainTrace", null, null, !16} -!16 = !{i32 8, i32 7} -!17 = !{%struct.SystemData poison} -!18 = !{i32 0, %struct.SystemData poison} -!19 = !{%struct.DispatchSystemData poison} -!20 = !{i32 0, %struct.DispatchSystemData poison} -!21 = !{%struct.DispatchSystemData poison} -!22 = !{%struct.DispatchSystemData poison} -!23 = !{%struct.DispatchSystemData poison} -!24 = !{%struct.MyParams poison} -!25 = !{i32 0, %struct.MyParams poison} -!26 = !{%struct.RayPayload poison} -!27 = !{i32 0, %struct.RayPayload poison} -!28 = !{%struct.TheirParams poison} -!29 = !{i32 0, %struct.TheirParams poison} -!30 = !{%struct.TheirParams2 poison} -!31 = !{i32 0, %struct.TheirParams2 poison} -!32 = !{i32 0, %struct.TraversalData poison} -!33 = !{%struct.TraversalData poison} -!34 = !{i32 8} diff --git a/llvmraytracing/test/dx/continuation-state.ll b/llvmraytracing/test/dx/continuation-state.ll deleted file mode 100644 index f3b23c7997..0000000000 --- 
a/llvmraytracing/test/dx/continuation-state.ll +++ /dev/null @@ -1,121 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -declare void @lgc.cps.await__void(...) -declare i32 @_cont_GetContinuationStackAddr() -declare ptr @async_fun(i32, i32) -declare void @lgc.cps.jump(...) -declare void @lgc.cps.complete() - -define <4 x i32> @simple_await(i32 %returnAddr, <4 x i32> %arg) !continuation.registercount !1 { - %callee = ptrtoint ptr @async_fun to i32 - call void (...) @lgc.cps.await__void(i32 %callee, i32 3), !continuation.registercount !1, !continuation.returnedRegistercount !1 - call void (...) @lgc.cps.jump(i32 %returnAddr, i32 -1, i32 poison, i32 poison, <4 x i32> %arg), !continuation.registercount !1 - unreachable -} - -define void @simple_await_entry(i32 %returnAddr, <4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !continuation.entry !0 !continuation.registercount !1 { - %callee = ptrtoint ptr @async_fun to i32 - call void (...) 
@lgc.cps.await__void(i32 %callee, i32 3), !continuation.registercount !1, !continuation.returnedRegistercount !1 - store <4 x i32> %arg, <4 x i32> addrspace(1)* %mem - call void @lgc.cps.complete(), !continuation.registercount !1 - unreachable -} - -!continuation.maxPayloadRegisterCount = !{!2} -!continuation.stackAddrspace = !{!3} - -!0 = !{} -!1 = !{i32 0} -!2 = !{i32 30} -!3 = !{i32 21} -; CLEANUP-LABEL: define void @simple_await( -; CLEANUP-SAME: i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], <4 x i32> [[ARG:%.*]]) !continuation.registercount [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.stacksize [[META4:![0-9]+]] !continuation.state [[META4]] { -; CLEANUP-NEXT: AllocaSpillBB: -; CLEANUP-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CLEANUP-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 20 -; CLEANUP-NEXT: store i32 [[TMP8]], ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(21) -; CLEANUP-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; CLEANUP-NEXT: store <4 x i32> [[ARG]], ptr addrspace(21) [[TMP3]], align 4 -; CLEANUP-NEXT: [[TMP4:%.*]] = add i32 [[TMP7]], 16 -; CLEANUP-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) -; CLEANUP-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0 -; CLEANUP-NEXT: store i32 [[RETURNADDR]], ptr addrspace(21) [[TMP6]], align 4 -; CLEANUP-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CLEANUP-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CLEANUP-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @simple_await.resume.0) -; CLEANUP-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP9]], i32 [[TMP1]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] -; CLEANUP-NEXT: unreachable -; -; -; CLEANUP-LABEL: define dso_local void @simple_await.resume.0( -; CLEANUP-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]]) !continuation.registercount [[META2]] !continuation [[META3]] { -; CLEANUP-NEXT: entryresume.0: -; CLEANUP-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CLEANUP-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -20 -; CLEANUP-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) -; CLEANUP-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP3]], i32 0 -; CLEANUP-NEXT: [[ARG_RELOAD:%.*]] = load <4 x i32>, ptr addrspace(21) [[TMP4]], align 4 -; CLEANUP-NEXT: [[TMP5:%.*]] = add i32 [[TMP2]], 16 -; CLEANUP-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(21) -; CLEANUP-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP6]], i32 0 -; CLEANUP-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP7]], align 4 -; CLEANUP-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], -20 -; CLEANUP-NEXT: store i32 [[TMP9]], ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 -1, i32 [[TMP10]], i32 poison, <4 x i32> [[ARG_RELOAD]]), !continuation.registercount [[META2]] -; CLEANUP-NEXT: unreachable -; -; -; CLEANUP-LABEL: define void @simple_await_entry( -; CLEANUP-SAME: i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], <4 x i32> [[ARG:%.*]], ptr addrspace(1) [[MEM:%.*]]) !continuation.registercount [[META2]] !continuation.entry [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] !continuation.stacksize [[META7:![0-9]+]] !continuation.state [[META7]] { -; CLEANUP-NEXT: AllocaSpillBB: -; CLEANUP-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CLEANUP-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 24 -; CLEANUP-NEXT: store i32 [[TMP8]], ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP2:%.*]] = add i32 [[TMP7]], 16 -; CLEANUP-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) -; CLEANUP-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP3]], i32 0 -; CLEANUP-NEXT: store ptr addrspace(1) [[MEM]], ptr addrspace(21) [[TMP4]], align 4 -; CLEANUP-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(21) -; CLEANUP-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0 -; CLEANUP-NEXT: store <4 x i32> [[ARG]], ptr addrspace(21) [[TMP6]], align 4 -; CLEANUP-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CLEANUP-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CLEANUP-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @simple_await_entry.resume.0) -; CLEANUP-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP9]], i32 [[TMP1]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] -; CLEANUP-NEXT: unreachable -; -; -; CLEANUP-LABEL: define dso_local void @simple_await_entry.resume.0( -; CLEANUP-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]]) !continuation.registercount [[META2]] !continuation [[META6]] { -; CLEANUP-NEXT: entryresume.0: -; CLEANUP-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CLEANUP-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -24 -; CLEANUP-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 16 -; CLEANUP-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) -; CLEANUP-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 -; CLEANUP-NEXT: [[MEM_RELOAD:%.*]] = load ptr addrspace(1), ptr addrspace(21) [[TMP5]], align 4 -; CLEANUP-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) -; CLEANUP-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP6]], i32 0 -; CLEANUP-NEXT: [[ARG_RELOAD:%.*]] = load <4 x i32>, ptr addrspace(21) [[TMP7]], align 4 -; CLEANUP-NEXT: store <4 x i32> [[ARG_RELOAD]], ptr addrspace(1) [[MEM_RELOAD]], align 4 -; CLEANUP-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], -24 -; CLEANUP-NEXT: store i32 [[TMP9]], ptr [[CSP]], align 4 -; CLEANUP-NEXT: ret void -; diff --git a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace-payload-type.ll b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace-payload-type.ll deleted file mode 100644 index a16097da5d..0000000000 --- a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace-payload-type.ll +++ /dev/null @@ -1,276 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s 
--lint-abort-on-error | FileCheck -check-prefix=PAYLOADTYPE-OPAQUE %s -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=PAYLOADTYPE2-OPAQUE %s -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=PAYLOADTYPE3-OPAQUE %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.MyParams = type { [48 x i32] } -%struct.TheirParams = type { [64 x i32] } -%struct.TheirParams2 = type { [65 x i32] } -%struct.RayPayload = type { [68 x i32] } -%struct.RayPayload2 = type { [64 x i32] } -%dx.types.Handle = type { i8* } -%dx.types.ResourceProperties = type { i32, i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -%struct.DispatchSystemData = type { i32 } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } - -; Function Attrs: nounwind -declare !pointeetys !39 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #0 -declare !pointeetys !49 void @dx.op.traceRay.struct.RayPayload2(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload2*) #0 - -; Function Attrs: nounwind readnone -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 - -; Function Attrs: nounwind readonly -declare %dx.types.Handle 
@dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2 - -; Function Attrs: nounwind -declare !pointeetys !40 void @dx.op.callShader.struct.TheirParams(i32, i32, %struct.TheirParams*) #0 -declare !pointeetys !41 void @dx.op.callShader.struct.TheirParams2(i32, i32, %struct.TheirParams2*) #0 - -declare float @dx.op.rayTCurrent.f32(i32) #1 -declare float @dx.op.rayTMin.f32(i32) #2 -declare i32 @dx.op.hitKind.i32(i32) #2 -declare i32 @dx.op.instanceID.i32(i32) #2 -declare !pointeetys !42 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) #5 - -define void @main() { -; PAYLOADTYPE-LABEL: define void @main -; PAYLOADTYPE: call void (...) @lgc.rt.call.callable.shader(i32 1, %struct.TheirParams* %{{.*}}, i32 256), !cont.payload.type ![[call_callable_shader_payload_type:[0-9]+]] -; PAYLOADTYPE-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 1, %struct.TheirParams* %{{.*}}, i32 256), !cont.payload.type ![[call_callable_shader_payload_type]] -; PAYLOADTYPE-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 2, %struct.TheirParams2* %{{.*}}, i32 260), !cont.payload.type ![[call_callable_shader_payload_type2:[0-9]+]] -; PAYLOADTYPE: ![[call_callable_shader_payload_type]] = !{%struct.TheirParams poison} -; PAYLOADTYPE: ![[call_callable_shader_payload_type2]] = !{%struct.TheirParams2 poison} -; PAYLOADTYPE-OPAQUE-LABEL: define void @main( -; PAYLOADTYPE-OPAQUE-SAME: ) !lgc.rt.shaderstage [[META8:![0-9]+]] { -; PAYLOADTYPE-OPAQUE-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS:%.*]], align 4 -; PAYLOADTYPE-OPAQUE-NEXT: [[PARAMS2:%.*]] = alloca [[STRUCT_THEIRPARAMS2:%.*]], align 4 -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; PAYLOADTYPE-OPAQUE-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; PAYLOADTYPE-OPAQUE-NEXT: call void (...) 
@lgc.rt.call.callable.shader(i32 1, ptr [[PARAMS]], i32 256), !cont.payload.type [[META19:![0-9]+]] -; PAYLOADTYPE-OPAQUE-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 1, ptr [[PARAMS]], i32 256), !cont.payload.type [[META19]] -; PAYLOADTYPE-OPAQUE-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 2, ptr [[PARAMS2]], i32 260), !cont.payload.type [[META20:![0-9]+]] -; PAYLOADTYPE-OPAQUE-NEXT: ret void -; -; PAYLOADTYPE2-OPAQUE-LABEL: define void @main( -; PAYLOADTYPE2-OPAQUE-SAME: ) !lgc.rt.shaderstage [[META8:![0-9]+]] { -; PAYLOADTYPE2-OPAQUE-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS:%.*]], align 4 -; PAYLOADTYPE2-OPAQUE-NEXT: [[PARAMS2:%.*]] = alloca [[STRUCT_THEIRPARAMS2:%.*]], align 4 -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; PAYLOADTYPE2-OPAQUE-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; PAYLOADTYPE2-OPAQUE-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 1, ptr [[PARAMS]], i32 256), !cont.payload.type [[META19:![0-9]+]] -; PAYLOADTYPE2-OPAQUE-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 1, ptr [[PARAMS]], i32 256), !cont.payload.type [[META19]] -; PAYLOADTYPE2-OPAQUE-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 2, ptr [[PARAMS2]], i32 260), !cont.payload.type [[META20:![0-9]+]] -; PAYLOADTYPE2-OPAQUE-NEXT: ret void -; -; PAYLOADTYPE3-OPAQUE-LABEL: define void @main( -; PAYLOADTYPE3-OPAQUE-SAME: ) !lgc.rt.shaderstage [[META8:![0-9]+]] { -; PAYLOADTYPE3-OPAQUE-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS:%.*]], align 4 -; PAYLOADTYPE3-OPAQUE-NEXT: [[PARAMS2:%.*]] = alloca [[STRUCT_THEIRPARAMS2:%.*]], align 4 -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; PAYLOADTYPE3-OPAQUE-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; PAYLOADTYPE3-OPAQUE-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 1, ptr [[PARAMS]], i32 256), !cont.payload.type [[META19:![0-9]+]] -; PAYLOADTYPE3-OPAQUE-NEXT: call void (...) 
@lgc.rt.call.callable.shader(i32 1, ptr [[PARAMS]], i32 256), !cont.payload.type [[META19]] -; PAYLOADTYPE3-OPAQUE-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 2, ptr [[PARAMS2]], i32 260), !cont.payload.type [[META20:![0-9]+]] -; PAYLOADTYPE3-OPAQUE-NEXT: ret void -; - %params = alloca %struct.TheirParams, align 4 - %params2 = alloca %struct.TheirParams2, align 4 - call void @dx.op.callShader.struct.TheirParams(i32 159, i32 1, %struct.TheirParams* nonnull %params) ; CallShader(ShaderIndex,Parameter) - call void @dx.op.callShader.struct.TheirParams(i32 159, i32 1, %struct.TheirParams* nonnull %params) ; CallShader(ShaderIndex,Parameter) - call void @dx.op.callShader.struct.TheirParams2(i32 159, i32 2, %struct.TheirParams2* nonnull %params2) ; CallShader(ShaderIndex,Parameter) - ret void -} - -define void @mainTrace() { -; PAYLOADTYPE2-LABEL: define void @mainTrace -; PAYLOADTYPE2: call void (...) @lgc.rt.trace.ray(i64 %{{.*}}, i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, %struct.RayPayload* %{{.*}}, [1 x i32] [i32 272]), !cont.payload.type ![[traceray_payload_type:[0-9]+]] -; PAYLOADTYPE2: call void (...) 
@lgc.rt.trace.ray(i64 %{{.*}}, i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, %struct.RayPayload2* %{{.*}}, [1 x i32] [i32 256]), !cont.payload.type ![[traceray_payload_type2:[0-9]+]] -; PAYLOADTYPE2: ![[traceray_payload_type]] = !{%struct.RayPayload poison} -; PAYLOADTYPE2: ![[traceray_payload_type2]] = !{%struct.RayPayload2 poison} -; PAYLOADTYPE-OPAQUE-LABEL: define void @mainTrace( -; PAYLOADTYPE-OPAQUE-SAME: ) !lgc.rt.shaderstage [[META8]] { -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; PAYLOADTYPE-OPAQUE-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 4 -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_RAYPAYLOAD2:%.*]], align 4 -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD2]], ptr [[TMP5]], i32 0, i32 0 -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP9:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP8]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP10:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP9]]) -; PAYLOADTYPE-OPAQUE-NEXT: call void (...) 
@lgc.rt.trace.ray(i64 [[TMP10]], i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, ptr [[TMP4]], [1 x i32] [i32 272]), !cont.payload.type [[META17:![0-9]+]] -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP11:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP9]]) -; PAYLOADTYPE-OPAQUE-NEXT: call void (...) @lgc.rt.trace.ray(i64 [[TMP11]], i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, ptr [[TMP5]], [1 x i32] [i32 256]), !cont.payload.type [[META18:![0-9]+]] -; PAYLOADTYPE-OPAQUE-NEXT: ret void -; -; PAYLOADTYPE2-OPAQUE-LABEL: define void @mainTrace( -; PAYLOADTYPE2-OPAQUE-SAME: ) !lgc.rt.shaderstage [[META8]] { -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; PAYLOADTYPE2-OPAQUE-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 4 -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_RAYPAYLOAD2:%.*]], align 4 -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD2]], ptr [[TMP5]], i32 0, i32 0 -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP9:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, 
[[DX_TYPES_HANDLE]] [[TMP8]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP10:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP9]]) -; PAYLOADTYPE2-OPAQUE-NEXT: call void (...) @lgc.rt.trace.ray(i64 [[TMP10]], i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, ptr [[TMP4]], [1 x i32] [i32 272]), !cont.payload.type [[META17:![0-9]+]] -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP11:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP9]]) -; PAYLOADTYPE2-OPAQUE-NEXT: call void (...) @lgc.rt.trace.ray(i64 [[TMP11]], i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, ptr [[TMP5]], [1 x i32] [i32 256]), !cont.payload.type [[META18:![0-9]+]] -; PAYLOADTYPE2-OPAQUE-NEXT: ret void -; -; PAYLOADTYPE3-OPAQUE-LABEL: define void @mainTrace( -; PAYLOADTYPE3-OPAQUE-SAME: ) !lgc.rt.shaderstage [[META8]] { -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; PAYLOADTYPE3-OPAQUE-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 4 -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_RAYPAYLOAD2:%.*]], align 4 -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD2]], ptr [[TMP5]], i32 0, i32 0 -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] 
@[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP9:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP8]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP10:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP9]]) -; PAYLOADTYPE3-OPAQUE-NEXT: call void (...) @lgc.rt.trace.ray(i64 [[TMP10]], i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, ptr [[TMP4]], [1 x i32] [i32 272]), !cont.payload.type [[META17:![0-9]+]] -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP11:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP9]]) -; PAYLOADTYPE3-OPAQUE-NEXT: call void (...) @lgc.rt.trace.ray(i64 [[TMP11]], i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, ptr [[TMP5]], [1 x i32] [i32 256]), !cont.payload.type [[META18:![0-9]+]] -; PAYLOADTYPE3-OPAQUE-NEXT: ret void -; - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.RayPayload, align 4 - %4 = alloca %struct.RayPayload2, align 4 - %5 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 - %6 = getelementptr inbounds %struct.RayPayload2, %struct.RayPayload2* %4, i32 0, i32 0 - %7 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) ; CreateHandleForLib(Resource) - %8 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %7, %dx.types.ResourceProperties { i32 16, i32 0 }) ; AnnotateHandle(res,props) resource: 
RTAccelerationStructure - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %8, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) ; TraceRay(AccelerationStructure,RayFlags,InstanceInclusionMask,RayContributionToHitGroupIndex,MultiplierForGeometryContributionToShaderIndex,MissShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) - call void @dx.op.traceRay.struct.RayPayload2(i32 157, %dx.types.Handle %8, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload2* nonnull %4) ; TraceRay(AccelerationStructure,RayFlags,InstanceInclusionMask,RayContributionToHitGroupIndex,MultiplierForGeometryContributionToShaderIndex,MissShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) - ret void -} - -define void @called(%struct.MyParams* %arg) !pointeetys !38 { -; PAYLOADTYPE3-LABEL: define void @called -; PAYLOADTYPE3: call void (...) @lgc.rt.call.callable.shader(i32 2, %struct.TheirParams2* %{{.*}}, i32 260), !cont.payload.type ![[call_callable_shader_payload_type:[0-9]+]] -; PAYLOADTYPE3: ![[call_callable_shader_payload_type]] = !{%struct.TheirParams2 poison} -; PAYLOADTYPE-OPAQUE-LABEL: define void @called( -; PAYLOADTYPE-OPAQUE-SAME: ptr [[ARG:%.*]]) !pointeetys [[META22:![0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !cont.payload.type [[META22]] { -; PAYLOADTYPE-OPAQUE-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS2:%.*]], align 4 -; PAYLOADTYPE-OPAQUE-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; PAYLOADTYPE-OPAQUE-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; PAYLOADTYPE-OPAQUE-NEXT: call void (...) 
@lgc.rt.call.callable.shader(i32 2, ptr [[PARAMS]], i32 260), !cont.payload.type [[META20]] -; PAYLOADTYPE-OPAQUE-NEXT: ret void -; -; PAYLOADTYPE2-OPAQUE-LABEL: define void @called( -; PAYLOADTYPE2-OPAQUE-SAME: ptr [[ARG:%.*]]) !pointeetys [[META22:![0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !cont.payload.type [[META22]] { -; PAYLOADTYPE2-OPAQUE-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS2:%.*]], align 4 -; PAYLOADTYPE2-OPAQUE-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; PAYLOADTYPE2-OPAQUE-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; PAYLOADTYPE2-OPAQUE-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 2, ptr [[PARAMS]], i32 260), !cont.payload.type [[META20]] -; PAYLOADTYPE2-OPAQUE-NEXT: ret void -; -; PAYLOADTYPE3-OPAQUE-LABEL: define void @called( -; PAYLOADTYPE3-OPAQUE-SAME: ptr [[ARG:%.*]]) !pointeetys [[META22:![0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !cont.payload.type [[META22]] { -; PAYLOADTYPE3-OPAQUE-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS2:%.*]], align 4 -; PAYLOADTYPE3-OPAQUE-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; PAYLOADTYPE3-OPAQUE-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; PAYLOADTYPE3-OPAQUE-NEXT: call void (...) 
@lgc.rt.call.callable.shader(i32 2, ptr [[PARAMS]], i32 260), !cont.payload.type [[META20]] -; PAYLOADTYPE3-OPAQUE-NEXT: ret void -; - %params = alloca %struct.TheirParams2, align 4 - call void @dx.op.callShader.struct.TheirParams2(i32 159, i32 2, %struct.TheirParams2* nonnull %params) ; CallShader(ShaderIndex,Parameter) - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind readonly } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.entryPoints = !{!18, !5, !34, !36} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{void ()* @"main", !"main", null, null, !21} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!11 = !{!12} -!12 = !{i32 1, !13, !13} -!13 = !{} -!14 = !{!12, !15, !16} -!15 = !{i32 2, !13, !13} -!16 = !{i32 0, !13, !13} -!17 = !{!12, !15} -!18 = !{null, !"", null, !3, !19} -!19 = !{i32 0, i64 65536} -!21 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!22 = !{i32 0} -!24 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!26 = !{i32 8, i32 8, i32 5, !22} -!28 = !{i32 8, i32 11, i32 6, i32 16, i32 5, !22} -!30 = !{i32 8, i32 7, i32 5, !22} -!31 = !{!32, !32, i64 0} -!32 = !{!"omnipotent char", !33, i64 0} -!33 = !{!"Simple C/C++ TBAA"} -!34 = !{void (%struct.MyParams*)* @called, !"called", null, null, !35} -!35 = !{i32 8, i32 12} -!36 = !{void ()* @mainTrace, !"mainTrace", null, null, !37} -!37 = !{i32 8, i32 7} -!38 = !{%struct.MyParams poison} -!39 = !{%struct.RayPayload poison} -!40 = !{%struct.TheirParams poison} -!41 = !{%struct.TheirParams2 poison} -!42 = 
!{%struct.BuiltInTriangleIntersectionAttributes poison} -!43 = !{i32 0, %struct.MyParams poison} -!44 = !{i32 0, %struct.RayPayload poison} -!45 = !{i32 0, %struct.TheirParams poison} -!46 = !{i32 0, %struct.TheirParams2 poison} -!47 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} -!48 = !{i32 0, %struct.RayPayload2 poison} -!49 = !{%struct.RayPayload2 poison} -;. -; PAYLOADTYPE-OPAQUE: [[META8]] = !{i32 0} -; PAYLOADTYPE-OPAQUE: [[META17]] = !{%struct.RayPayload poison} -; PAYLOADTYPE-OPAQUE: [[META18]] = !{%struct.RayPayload2 poison} -; PAYLOADTYPE-OPAQUE: [[META19]] = !{%struct.TheirParams poison} -; PAYLOADTYPE-OPAQUE: [[META20]] = !{%struct.TheirParams2 poison} -; PAYLOADTYPE-OPAQUE: [[META22]] = !{%struct.MyParams poison} -; PAYLOADTYPE-OPAQUE: [[META23]] = !{i32 5} -;. -; PAYLOADTYPE2-OPAQUE: [[META8]] = !{i32 0} -; PAYLOADTYPE2-OPAQUE: [[META17]] = !{%struct.RayPayload poison} -; PAYLOADTYPE2-OPAQUE: [[META18]] = !{%struct.RayPayload2 poison} -; PAYLOADTYPE2-OPAQUE: [[META19]] = !{%struct.TheirParams poison} -; PAYLOADTYPE2-OPAQUE: [[META20]] = !{%struct.TheirParams2 poison} -; PAYLOADTYPE2-OPAQUE: [[META22]] = !{%struct.MyParams poison} -; PAYLOADTYPE2-OPAQUE: [[META23]] = !{i32 5} -;. -; PAYLOADTYPE3-OPAQUE: [[META8]] = !{i32 0} -; PAYLOADTYPE3-OPAQUE: [[META17]] = !{%struct.RayPayload poison} -; PAYLOADTYPE3-OPAQUE: [[META18]] = !{%struct.RayPayload2 poison} -; PAYLOADTYPE3-OPAQUE: [[META19]] = !{%struct.TheirParams poison} -; PAYLOADTYPE3-OPAQUE: [[META20]] = !{%struct.TheirParams2 poison} -; PAYLOADTYPE3-OPAQUE: [[META22]] = !{%struct.MyParams poison} -; PAYLOADTYPE3-OPAQUE: [[META23]] = !{i32 5} -;. 
diff --git a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll deleted file mode 100644 index 63527fa776..0000000000 --- a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll +++ /dev/null @@ -1,165 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s --lint-abort-on-error | FileCheck %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.MyParams = type { [48 x i32] } -%struct.TheirParams = type { [64 x i32] } -%struct.TheirParams2 = type { [65 x i32] } -%struct.RayPayload = type { [68 x i32] } -%dx.types.Handle = type { i8* } -%dx.types.ResourceProperties = type { i32, i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -%struct.DispatchSystemData = type { i32 } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } - -; Function Attrs: nounwind -declare !pointeetys !39 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #0 - -; Function Attrs: nounwind readnone -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 - -; Function Attrs: nounwind readonly -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2 - -; Function Attrs: nounwind -declare !pointeetys !40 void 
@dx.op.callShader.struct.TheirParams(i32, i32, %struct.TheirParams*) #0 -declare !pointeetys !41 void @dx.op.callShader.struct.TheirParams2(i32, i32, %struct.TheirParams2*) #0 - -declare float @dx.op.rayTCurrent.f32(i32) #1 -declare float @dx.op.rayTMin.f32(i32) #2 -declare i32 @dx.op.hitKind.i32(i32) #2 -declare i32 @dx.op.instanceID.i32(i32) #2 -declare !pointeetys !42 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) #5 - -; Function Attrs: nounwind -define void @Intersection() #0 { -; CHECK-LABEL: define void @Intersection( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = call float @lgc.rt.ray.tmin() -; CHECK-NEXT: [[TMP2:%.*]] = call float @lgc.rt.ray.tcurrent() -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.rt.instance.id() -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.rt.hit.kind() -; CHECK-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = call i1 (...) 
@lgc.rt.report.hit(float 4.000000e+00, i32 0, ptr [[TMP5]], i32 8), !cont.payload.type [[META20:![0-9]+]] -; CHECK-NEXT: ret void -; - %1 = call float @dx.op.rayTMin.f32(i32 153) ; RayTMin() - %2 = call float @dx.op.rayTCurrent.f32(i32 154) ; RayTCurrent() - %3 = call i32 @dx.op.instanceID.i32(i32 141) ; InstanceID() - %4 = call i32 @dx.op.hitKind.i32(i32 143) ; HitKind() - %5 = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4 - %6 = call i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32 158, float 4.0, i32 0, %struct.BuiltInTriangleIntersectionAttributes* nonnull %5) ; ReportHit(THit,HitKind,Attributes) - ret void -} - -define void @main() { -; CHECK-LABEL: define void @main( -; CHECK-SAME: ) !lgc.rt.shaderstage [[META8:![0-9]+]] { -; CHECK-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; CHECK-NEXT: call void (...) 
@lgc.rt.call.callable.shader(i32 1, ptr [[PARAMS]], i32 256), !cont.payload.type [[META18:![0-9]+]] -; CHECK-NEXT: ret void -; - %params = alloca %struct.TheirParams, align 4 - call void @dx.op.callShader.struct.TheirParams(i32 159, i32 1, %struct.TheirParams* nonnull %params) ; CallShader(ShaderIndex,Parameter) - ret void -} - -define void @mainTrace() { -; CHECK-LABEL: define void @mainTrace( -; CHECK-SAME: ) !lgc.rt.shaderstage [[META8]] { -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; CHECK-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; CHECK-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP6]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP7]]) -; CHECK-NEXT: call void (...) 
@lgc.rt.trace.ray(i64 [[TMP8]], i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, ptr [[TMP4]], [1 x i32] [i32 272]), !cont.payload.type [[META17:![0-9]+]] -; CHECK-NEXT: ret void -; - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.RayPayload, align 4 - %4 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 - %5 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) ; CreateHandleForLib(Resource) - %6 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %5, %dx.types.ResourceProperties { i32 16, i32 0 }) ; AnnotateHandle(res,props) resource: RTAccelerationStructure - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %6, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) ; TraceRay(AccelerationStructure,RayFlags,InstanceInclusionMask,RayContributionToHitGroupIndex,MultiplierForGeometryContributionToShaderIndex,MissShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) - ret void -} - -define void @called(%struct.MyParams* %arg) !pointeetys !38 { -; CHECK-LABEL: define void @called( -; CHECK-SAME: ptr [[ARG:%.*]]) !pointeetys [[META21:![0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !cont.payload.type [[META21]] { -; CHECK-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS2:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; CHECK-NEXT: call void (...) 
@lgc.rt.call.callable.shader(i32 2, ptr [[PARAMS]], i32 260), !cont.payload.type [[META19:![0-9]+]] -; CHECK-NEXT: ret void -; - %params = alloca %struct.TheirParams2, align 4 - call void @dx.op.callShader.struct.TheirParams2(i32 159, i32 2, %struct.TheirParams2* nonnull %params) ; CallShader(ShaderIndex,Parameter) - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind readonly } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.entryPoints = !{!18, !5, !34, !36} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{void ()* @"main", !"main", null, null, !21} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!11 = !{!12} -!12 = !{i32 1, !13, !13} -!13 = !{} -!14 = !{!12, !15, !16} -!15 = !{i32 2, !13, !13} -!16 = !{i32 0, !13, !13} -!17 = !{!12, !15} -!18 = !{null, !"", null, !3, !19} -!19 = !{i32 0, i64 65536} -!21 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!22 = !{i32 0} -!24 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!26 = !{i32 8, i32 8, i32 5, !22} -!28 = !{i32 8, i32 11, i32 6, i32 16, i32 5, !22} -!30 = !{i32 8, i32 7, i32 5, !22} -!31 = !{!32, !32, i64 0} -!32 = !{!"omnipotent char", !33, i64 0} -!33 = !{!"Simple C/C++ TBAA"} -!34 = !{void (%struct.MyParams*)* @called, !"called", null, null, !35} -!35 = !{i32 8, i32 12} -!36 = !{void ()* @mainTrace, !"mainTrace", null, null, !37} -!37 = !{i32 8, i32 7} -!38 = !{%struct.MyParams poison} -!39 = !{%struct.RayPayload poison} -!40 = !{%struct.TheirParams poison} -!41 = !{%struct.TheirParams2 poison} -!42 = 
!{%struct.BuiltInTriangleIntersectionAttributes poison} -!43 = !{i32 0, %struct.MyParams poison} -!44 = !{i32 0, %struct.RayPayload poison} -!45 = !{i32 0, %struct.TheirParams poison} -!46 = !{i32 0, %struct.TheirParams2 poison} -!47 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} diff --git a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op.ll b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op.ll deleted file mode 100644 index ebf780d7c8..0000000000 --- a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op.ll +++ /dev/null @@ -1,139 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function ClosestHit --version 3 -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.DispatchSystemData = type { <3 x i32> } -%struct.SystemData = type { %struct.DispatchSystemData } - -%dx.types.Handle = type { i8* } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } -%struct.RayPayload = type { float, float, i32, i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -; Function Attrs: nounwind -define void @ClosestHit(%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*) #0 !pointeetys !31 { -; CHECK-LABEL: define void @ClosestHit( -; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] !cont.payload.type 
[[META19:![0-9]+]] !lgc.rt.shaderstage [[META20:![0-9]+]] { -; CHECK-NEXT: [[TMP3:%.*]] = alloca [4 x <3 x float>], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = alloca [4 x <3 x float>], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = alloca [4 x <3 x float>], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = alloca [4 x <3 x float>], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @lgc.rt.shader.index() -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP7]]) -; CHECK-NEXT: [[TMP8:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; CHECK-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP8]], i8 0 -; CHECK-NEXT: [[TMP9:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.dimensions() -; CHECK-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP9]], i8 0 -; CHECK-NEXT: [[TMP10:%.*]] = call <3 x float> @lgc.rt.world.ray.origin() -; CHECK-NEXT: [[C:%.*]] = extractelement <3 x float> [[TMP10]], i8 0 -; CHECK-NEXT: [[TMP11:%.*]] = call <3 x float> @lgc.rt.world.ray.direction() -; CHECK-NEXT: [[D:%.*]] = extractelement <3 x float> [[TMP11]], i8 0 -; CHECK-NEXT: [[E:%.*]] = call float @lgc.rt.ray.tmin() -; CHECK-NEXT: [[F:%.*]] = call float @lgc.rt.ray.tcurrent() -; CHECK-NEXT: [[G:%.*]] = call i32 @lgc.rt.ray.flags() -; CHECK-NEXT: [[H:%.*]] = call i32 @lgc.rt.instance.index() -; CHECK-NEXT: [[I:%.*]] = call i32 @lgc.rt.instance.id() -; CHECK-NEXT: [[J:%.*]] = call i32 @lgc.rt.primitive.index() -; CHECK-NEXT: [[TMP12:%.*]] = call <3 x float> @lgc.rt.object.ray.origin() -; CHECK-NEXT: [[K:%.*]] = extractelement <3 x float> [[TMP12]], i8 0 -; CHECK-NEXT: [[TMP13:%.*]] = call <3 x float> @lgc.rt.object.ray.direction() -; CHECK-NEXT: [[L:%.*]] = extractelement <3 x float> [[TMP13]], i8 0 -; CHECK-NEXT: [[TMP14:%.*]] = call [4 x <3 x float>] @lgc.rt.object.to.world() -; CHECK-NEXT: store [4 x <3 x float>] [[TMP14]], ptr [[TMP4]], align 4 -; CHECK-NEXT: [[COL_GEP1:%.*]] = getelementptr [4 x <3 x float>], ptr [[TMP4]], i32 0, i8 0 -; CHECK-NEXT: [[COL_GEP_LOAD2:%.*]] = load <3 x float>, ptr [[COL_GEP1]], 
align 4 -; CHECK-NEXT: [[M:%.*]] = extractelement <3 x float> [[COL_GEP_LOAD2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = call [4 x <3 x float>] @lgc.rt.world.to.object() -; CHECK-NEXT: store [4 x <3 x float>] [[TMP15]], ptr [[TMP3]], align 4 -; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr [4 x <3 x float>], ptr [[TMP3]], i32 0, i8 0 -; CHECK-NEXT: [[COL_GEP_LOAD:%.*]] = load <3 x float>, ptr [[COL_GEP]], align 4 -; CHECK-NEXT: [[N:%.*]] = extractelement <3 x float> [[COL_GEP_LOAD]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = call [4 x <3 x float>] @lgc.rt.object.to.world() -; CHECK-NEXT: store [4 x <3 x float>] [[TMP16]], ptr [[TMP5]], align 4 -; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr [4 x <3 x float>], ptr [[TMP5]], i32 0, i8 3 -; CHECK-NEXT: [[COL_GEP_LOAD4:%.*]] = load <3 x float>, ptr [[COL_GEP3]], align 4 -; CHECK-NEXT: [[O:%.*]] = extractelement <3 x float> [[COL_GEP_LOAD4]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = call [4 x <3 x float>] @lgc.rt.object.to.world() -; CHECK-NEXT: store [4 x <3 x float>] [[TMP17]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[COL_GEP5:%.*]] = getelementptr [4 x <3 x float>], ptr [[TMP6]], i32 0, i8 3 -; CHECK-NEXT: [[COL_GEP_LOAD6:%.*]] = load <3 x float>, ptr [[COL_GEP5]], align 4 -; CHECK-NEXT: [[P:%.*]] = extractelement <3 x float> [[COL_GEP_LOAD6]], i32 2 -; CHECK-NEXT: [[Q:%.*]] = call i32 @lgc.rt.hit.kind() -; CHECK-NEXT: ret void -; - %a = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) ; DispatchRaysIndex(col) - %b = call i32 @dx.op.dispatchRaysDimensions.i32(i32 146, i8 0) ; DispatchRaysDimensions(col) - %c = call float @dx.op.worldRayOrigin.f32(i32 147, i8 0) ; WorldRayOrigin(col) - %d = call float @dx.op.worldRayDirection.f32(i32 148, i8 0) ; WorldRayDirection(col) - %e = call float @dx.op.rayTMin.f32(i32 153) ; RayTMin() - %f = call float @dx.op.rayTCurrent.f32(i32 154) ; RayTCurrent() - %g = call i32 @dx.op.rayFlags.i32(i32 144) ; RayFlags() - %h = call i32 @dx.op.instanceIndex.i32(i32 142) ; InstanceIndex() - %i = call i32 
@dx.op.instanceID.i32(i32 141) ; InstanceID() - %j = call i32 @dx.op.primitiveIndex.i32(i32 161) ; PrimitiveIndex() - %k = call float @dx.op.objectRayOrigin.f32(i32 149, i8 0) ; ObjectRayOrigin(col) - %l = call float @dx.op.objectRayDirection.f32(i32 150, i8 0) ; ObjectRayDirection(col) - %m = call float @dx.op.objectToWorld.f32(i32 151, i32 0, i8 0) ; ObjectToWorld(row,col) - %n = call float @dx.op.worldToObject.f32(i32 152, i32 0, i8 0) ; WorldToObject(row,col) - %o = call float @dx.op.objectToWorld.f32(i32 151, i32 0, i8 3) ; ObjectToWorld(row,col) - %p = call float @dx.op.objectToWorld.f32(i32 151, i32 2, i8 3) ; ObjectToWorld(row,col) - %q = call i32 @dx.op.hitKind.i32(i32 143) ; HitKind() - ret void -} - -declare i32 @dx.op.dispatchRaysDimensions.i32(i32, i8) #2 -declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #2 -declare float @dx.op.objectRayDirection.f32(i32, i8) #2 -declare float @dx.op.objectRayOrigin.f32(i32, i8) #2 -declare float @dx.op.worldRayDirection.f32(i32, i8) #2 -declare float @dx.op.worldRayOrigin.f32(i32, i8) #2 -declare float @dx.op.rayTCurrent.f32(i32) #1 -declare float @dx.op.rayTMin.f32(i32) #2 -declare i32 @dx.op.hitKind.i32(i32) #2 -declare i32 @dx.op.primitiveIndex.i32(i32) #2 -declare i32 @dx.op.instanceID.i32(i32) #2 -declare i32 @dx.op.instanceIndex.i32(i32) #2 -declare i32 @dx.op.rayFlags.i32(i32) #2 -declare float @dx.op.worldToObject.f32(i32, i32, i8) #2 -declare float @dx.op.objectToWorld.f32(i32, i32, i8) #2 - -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readonly } -attributes #2 = { nounwind readnone } -attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" 
"no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{!10} -!dx.entryPoints = !{!18, !29} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{i32 1, void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @ClosestHit, !11} -!11 = !{!12} -!12 = !{i32 1, !13, !13} -!13 = !{} -!18 = !{null, !"", null, !3, !19} -!19 = !{i32 0, i64 65536} -!22 = !{i32 0} -!29 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @ClosestHit, !"ClosestHit", null, null, !30} -!30 = !{i32 8, i32 10, i32 5, !22} -!31 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} -!32 = !{i32 0, %struct.RayPayload poison} -!33 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} diff --git a/llvmraytracing/test/dx/dxil-cont-post-process.ll b/llvmraytracing/test/dx/dxil-cont-post-process.ll deleted file mode 100644 index 9fdd78be84..0000000000 --- a/llvmraytracing/test/dx/dxil-cont-post-process.ll +++ /dev/null @@ -1,37 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-post-process,lint' -S %s 
--lint-abort-on-error | FileCheck %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.DispatchSystemData = type { i32 } - -@debug_global = external global i1 - -declare i32 @_cont_GetContinuationStackAddr() -declare i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) - -define void @RayGen(i32 %cspInit, i32 %dummyRetAddr, %struct.DispatchSystemData %0) !lgc.rt.shaderstage !3 !continuation.entry !4 !continuation !5 { -; CHECK-LABEL: define void @RayGen( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRETADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META3:![0-9]+]] !continuation.entry [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] { -; CHECK-NEXT: ret void -; - ret void -} - -define void @RayGen.resume.0(i32 %cspInit, i32 %0, %struct.DispatchSystemData %1) !lgc.rt.shaderstage !3 !continuation !5 { -; CHECK-LABEL: define void @RayGen.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META3]] !continuation [[META5]] { -; CHECK-NEXT: ret void -; - ret void -} - -!dx.entryPoints = !{!0} -!continuation.stackAddrspace = !{!2} - -!0 = !{ptr @RayGen, !"RayGen", null, null, !1} -!1 = !{i32 8, i32 7} -!2 = !{i32 21} -!3 = !{i32 0} -!4 = !{} -!5 = !{ptr @RayGen} diff --git a/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library-remove-waitmask.ll b/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library-remove-waitmask.ll deleted file mode 100644 index e8b6b2646e..0000000000 --- a/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library-remove-waitmask.ll +++ /dev/null @@ -1,90 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each 
-passes='dxil-cont-prepare-gpurt-library,lint' -S %s --lint-abort-on-error | FileCheck %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.DispatchSystemData = type { i32 } -%struct.TraversalData = type { %struct.SystemData, i32, i64 } -%struct.SystemData = type { %struct.DispatchSystemData, float } - -; Function Attrs: nounwind memory(none) -define i32 @_cont_GetContinuationStackAddr() #0 { - ret i32 1 -} - -; Function Attrs: nounwind -define void @_cont_TraceRay(%struct.DispatchSystemData* noalias nocapture sret(%struct.DispatchSystemData) %agg.result, %struct.DispatchSystemData* nocapture readonly %data, i64 %accelStruct, i32 %rayFlags, i32 %instanceInclusioMask, i32 %rayContributionToHitGroupIndex, i32 %multiplierForGeometryContributionToShaderIndex, i32 %missShaderIndex, float %originX, float %originY, float %originZ, float %tMin, float %dirX, float %dirY, float %dirZ, float %tMax) #1 !pointeetys !2 { - %1 = alloca %struct.TraversalData, align 4 - %2 = alloca %struct.DispatchSystemData, align 4 - %3 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0 - %4 = load i32, i32* %3, align 4 - %5 = bitcast %struct.TraversalData* %1 to i8* - call void @llvm.lifetime.start.p0i8(i64 12, i8* %5) #3 - %6 = getelementptr inbounds %struct.TraversalData, %struct.TraversalData* %1, i32 0, i32 0, i32 0, i32 0 - store i32 %4, i32* %6, align 4 - %addr = call i64 @_AmdGetResumePointAddr() #3 - %a = getelementptr inbounds %struct.TraversalData, %struct.TraversalData* %1, i32 0, i32 2 - store i64 %addr, i64* %a, align 4 - call void @"\01?_AmdWaitAwait@@YA?AUDispatchSystemData@@UTraversalData@@@Z"(%struct.DispatchSystemData* nonnull sret(%struct.DispatchSystemData) %2, i64 3, i64 -1, %struct.TraversalData* nonnull %1) 
#3 - %7 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %2, i32 0, i32 0 - %8 = load i32, i32* %7, align 4 - %9 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %agg.result, i32 0, i32 0 - store i32 %8, i32* %9, align 4 - call void @llvm.lifetime.end.p0i8(i64 12, i8* %5) #3 - ret void -} - -declare !pointeetys !3 void @"\01?_AmdWaitAwait@@YA?AUDispatchSystemData@@UTraversalData@@@Z"(%struct.DispatchSystemData* sret(%struct.DispatchSystemData), i64, i64, %struct.TraversalData*) #2 - -; Function Attrs: nounwind -declare i64 @_AmdGetResumePointAddr() #3 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !5 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !5 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #4 - -attributes #0 = { nounwind memory(none) "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } -attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!0 = !{%struct.DispatchSystemData poison} -!1 = !{i32 0, 
%struct.DispatchSystemData poison} -!2 = !{null, %struct.DispatchSystemData poison, %struct.DispatchSystemData poison} -!3 = !{null, %struct.DispatchSystemData poison, null, null, %struct.TraversalData poison} -!4 = !{i32 0, %struct.TraversalData poison} -!5 = !{i8 poison} -!6 = !{i32 0, i8 poison} -; CHECK-LABEL: define i32 @_cont_GetContinuationStackAddr( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: ret i32 1 -; -; -; CHECK-LABEL: define %struct.DispatchSystemData @_cont_TraceRay( -; CHECK-SAME: ptr nocapture readonly [[DATA:%.*]], i64 [[ACCELSTRUCT:%.*]], i32 [[RAYFLAGS:%.*]], i32 [[INSTANCEINCLUSIOMASK:%.*]], i32 [[RAYCONTRIBUTIONTOHITGROUPINDEX:%.*]], i32 [[MULTIPLIERFORGEOMETRYCONTRIBUTIONTOSHADERINDEX:%.*]], i32 [[MISSSHADERINDEX:%.*]], float [[ORIGINX:%.*]], float [[ORIGINY:%.*]], float [[ORIGINZ:%.*]], float [[TMIN:%.*]], float [[DIRX:%.*]], float [[DIRY:%.*]], float [[DIRZ:%.*]], float [[TMAX:%.*]]) #[[ATTR1:[0-9]+]] !pointeetys [[META0:![0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_TRAVERSALDATA:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[DATA]], i32 0, i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast ptr [[TMP1]] to ptr -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[TMP6]]) #[[ATTR5:[0-9]+]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 0, i32 0, i32 0 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[ADDR:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR5]] -; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 2 -; CHECK-NEXT: store i64 [[ADDR]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load 
[[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[_AMDAWAIT:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i64 3, [[STRUCT_TRAVERSALDATA]] [[TMP8]]) #[[ATTR5]], !waitmask [[META1:![0-9]+]] -; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP3]], i32 0, i32 0 -; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP12]], align 4 -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[TMP6]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP13:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP3]], align 4 -; CHECK-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP13]] -; diff --git a/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library.ll b/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library.ll deleted file mode 100644 index 29129961a8..0000000000 --- a/llvmraytracing/test/dx/dxil-cont-prepare-gpurt-library.ll +++ /dev/null @@ -1,90 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint' -S %s --lint-abort-on-error | FileCheck %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.DispatchSystemData = type { i32 } -%struct.TraversalData = type { %struct.SystemData, i32, i32 } -%struct.SystemData = type { %struct.DispatchSystemData, float } - -; Function Attrs: nounwind memory(none) -define i32 @_cont_GetContinuationStackAddr() #0 { - ret i32 1 -} - -; 
Function Attrs: nounwind -define void @_cont_TraceRay(%struct.DispatchSystemData* noalias nocapture sret(%struct.DispatchSystemData) %agg.result, %struct.DispatchSystemData* nocapture readonly %data, i64 %accelStruct, i32 %rayFlags, i32 %instanceInclusioMask, i32 %rayContributionToHitGroupIndex, i32 %multiplierForGeometryContributionToShaderIndex, i32 %missShaderIndex, float %originX, float %originY, float %originZ, float %tMin, float %dirX, float %dirY, float %dirZ, float %tMax) #1 !pointeetys !2 { - %1 = alloca %struct.TraversalData, align 4 - %2 = alloca %struct.DispatchSystemData, align 4 - %3 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0 - %4 = load i32, i32* %3, align 4 - %5 = bitcast %struct.TraversalData* %1 to i8* - call void @llvm.lifetime.start.p0i8(i64 12, i8* %5) #3 - %6 = getelementptr inbounds %struct.TraversalData, %struct.TraversalData* %1, i32 0, i32 0, i32 0, i32 0 - store i32 %4, i32* %6, align 4 - %addr = call i32 @_AmdGetResumePointAddr() #3 - %a = getelementptr inbounds %struct.TraversalData, %struct.TraversalData* %1, i32 0, i32 2 - store i32 %addr, i32* %a, align 4 - call void @"\01?_AmdAwait@@YA?AUDispatchSystemData@@UTraversalData@@@Z"(%struct.DispatchSystemData* nonnull sret(%struct.DispatchSystemData) %2, i64 3, %struct.TraversalData* nonnull %1) #3 - %7 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %2, i32 0, i32 0 - %8 = load i32, i32* %7, align 4 - %9 = getelementptr inbounds %struct.DispatchSystemData, %struct.DispatchSystemData* %agg.result, i32 0, i32 0 - store i32 %8, i32* %9, align 4 - call void @llvm.lifetime.end.p0i8(i64 12, i8* %5) #3 - ret void -} - -declare !pointeetys !3 void @"\01?_AmdAwait@@YA?AUDispatchSystemData@@UTraversalData@@@Z"(%struct.DispatchSystemData* sret(%struct.DispatchSystemData), i64, %struct.TraversalData*) #2 - -; Function Attrs: nounwind -declare i32 @_AmdGetResumePointAddr() #3 - -; Function Attrs: nocallback 
nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !5 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !5 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #4 - -attributes #0 = { nounwind memory(none) "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } -attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!0 = !{%struct.DispatchSystemData poison} -!1 = !{i32 0, %struct.DispatchSystemData poison} -!2 = !{null, %struct.DispatchSystemData poison, %struct.DispatchSystemData poison} -!3 = !{null, %struct.DispatchSystemData poison, null, %struct.TraversalData poison} -!4 = !{i32 0, %struct.TraversalData poison} -!5 = !{i8 poison} -!6 = !{i32 0, i8 poison} -; CHECK-LABEL: define i32 @_cont_GetContinuationStackAddr( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: ret i32 1 -; -; -; CHECK-LABEL: define %struct.DispatchSystemData @_cont_TraceRay( -; CHECK-SAME: ptr nocapture readonly [[DATA:%.*]], i64 [[ACCELSTRUCT:%.*]], i32 [[RAYFLAGS:%.*]], i32 [[INSTANCEINCLUSIOMASK:%.*]], i32 [[RAYCONTRIBUTIONTOHITGROUPINDEX:%.*]], 
i32 [[MULTIPLIERFORGEOMETRYCONTRIBUTIONTOSHADERINDEX:%.*]], i32 [[MISSSHADERINDEX:%.*]], float [[ORIGINX:%.*]], float [[ORIGINY:%.*]], float [[ORIGINZ:%.*]], float [[TMIN:%.*]], float [[DIRX:%.*]], float [[DIRY:%.*]], float [[DIRZ:%.*]], float [[TMAX:%.*]]) #[[ATTR1:[0-9]+]] !pointeetys [[META0:![0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_TRAVERSALDATA:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[DATA]], i32 0, i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast ptr [[TMP1]] to ptr -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[TMP6]]) #[[ATTR5:[0-9]+]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 0, i32 0, i32 0 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[ADDR:%.*]] = call i32 @_AmdGetResumePointAddr() #[[ATTR5]] -; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 2 -; CHECK-NEXT: store i32 [[ADDR]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[_AMDAWAIT:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i64 3, [[STRUCT_TRAVERSALDATA]] [[TMP8]]) -; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP3]], i32 0, i32 0 -; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP12]], align 4 -; CHECK-NEXT: call void 
@llvm.lifetime.end.p0(i64 12, ptr [[TMP6]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP13:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP3]], align 4 -; CHECK-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP13]] -; diff --git a/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll b/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll deleted file mode 100644 index a4d0cf6847..0000000000 --- a/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll +++ /dev/null @@ -1,282 +0,0 @@ -; NOTE: Do not autogenerate -; Tests that if _cont_ExitRayGen ends with an enqueue, then we still free RayGen continuation state. -; This is a regression test, in an earlier version we only freed for returns and missed this case. -; RUN: grep -v "lgc.cps.module" %s | opt --verify-each -passes="dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck %s -; RUN: opt --verify-each -passes="dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck %s - -; There is just a single RayGen shader in this module, so any free must come from it. -; lgc.cps.free is lowered during cleanup-continuations. - -; CHECK-LABEL: define void @MyRayGen -; CHECK: [[CSP:%.*]] = alloca i32, align 4 - -; alloc(VALUE) -; CHECK: [[LOAD:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK: [[NEW:%.*]] = add i32 [[LOAD]], [[VALUE:[0-9]+]] -; CHECK: store i32 [[NEW]], ptr [[CSP]], align 4 - -; jump -; CHECK: [[LOAD2:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK: call void (...) 
@lgc.cps.jump({{i64|i32}} {{.*}}, i32 {{.*}}, i32 [[LOAD2]] - -; CHECK-LABEL: define dso_local void @MyRayGen.resume.0 -; CHECK: [[CSP2:%.*]] = alloca i32, align 4 - -; peek(VALUE) -; CHECK: [[LOAD3:%.*]] = load i32, ptr [[CSP2]], align 4 -; CHECK: [[NEW3:%.*]] = add i32 [[LOAD3]], -[[VALUE]] - -; free(VALUE) -; CHECK: [[LOAD4:%.*]] = load i32, ptr [[CSP2]], align 4 -; CHECK: [[NEW4:%.*]] = add i32 [[LOAD4]], -[[VALUE]] -; CHECK: store i32 [[NEW4]], ptr [[CSP2]], align 4 - -; jump -; CHECK: [[LOAD5:%.*]] = load i32, ptr [[CSP2]], align 4 -; CHECK: call void (...) @lgc.cps.jump({{i64|i32}} {{.*}}, i32 {{.*}}, i32 [[LOAD5]] - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.DispatchSystemData = type { <3 x i32> } -%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float, i32 } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.HitData = type { <3 x float>, <3 x float>, float, i32 } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.RayPayload = type { <4 x float> } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.BuiltInTriangleIntersectionAttributes2 = type { <2 x float> } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -define i32 @_cont_GetContinuationStackAddr() #0 { - ret i32 0 -} - -declare void @_AmdEnqueue(i32, i32, %struct.SystemData) - -define void @_cont_ExitRayGen(ptr nocapture 
readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { - call void @_AmdEnqueue(i32 1, i32 1, %struct.SystemData poison) - unreachable -} - -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i32, %struct.TraversalData) #0 - -declare !pointeetys !32 %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData* %data) #0 - -declare !pointeetys !34 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0 - -declare !pointeetys !36 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 - -declare !pointeetys !37 void @_cont_SetTriangleHitAttributes(%struct.SystemData* %data, %struct.BuiltInTriangleIntersectionAttributes %val) - -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeetys !38 { - ret i32 5 -} - -declare i1 @opaqueIsEnd() - -define i1 @_cont_IsEndSearch(%struct.TraversalData*) #0 !pointeetys !40 { - %isEnd = call i1 @opaqueIsEnd() - ret i1 %isEnd -} - -declare !pointeetys !42 i32 @_cont_HitKind(%struct.SystemData*) #0 - -; Function Attrs: nounwind -declare i32 @_AmdGetResumePointAddr() #1 - -; Function Attrs: nounwind -declare !pointeetys !43 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 - -; Function Attrs: nounwind -declare !pointeetys !44 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1 - -; Function Attrs: nounwind -declare !pointeetys !43 void @_cont_AcceptHitAndEndSearch(%struct.DispatchSystemData* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !44 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !43 void @_cont_IgnoreHit(%struct.DispatchSystemData* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !44 void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* nocapture readnone) #1 - -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 
%2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !45 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %addr = call i32 @_AmdGetResumePointAddr() #3 - %trav_data2 = insertvalue %struct.TraversalData %trav_data, i32 %addr, 5 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i32 4, %struct.TraversalData %trav_data2) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -declare !pointeetys !46 void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0; - -declare !pointeetys !47 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 - -declare !pointeetys !48 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData* %data) - -declare !pointeetys !49 <3 x float> @_cont_ObjectRayOrigin3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) - -declare !pointeetys !49 <3 x float> @_cont_ObjectRayDirection3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) - -declare !pointeetys !51 float @_cont_RayTCurrent(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) - -declare i32 @opaque() -declare void @use(i32) - -; Function Attrs: nounwind -define void @MyRayGen() #2 { - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.RayPayload, align 4 - %4 = bitcast %struct.RayPayload* %3 to i8* - call void @llvm.lifetime.start.p0i8(i64 16, 
i8* %4) #1 - %5 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 - store <4 x float> zeroinitializer, <4 x float>* %5, align 4, !tbaa !52 - %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) - %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }) - ; Ensure continuation state - %cont.state = call i32 @opaque() - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) - call void @use(i32 %cont.state) - %8 = load <4 x float>, <4 x float>* %5, align 4, !tbaa !52 - %9 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) - %10 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) - %11 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2) - %12 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %11, %dx.types.ResourceProperties { i32 4098, i32 1033 }) - %13 = extractelement <4 x float> %8, i64 0 - %14 = extractelement <4 x float> %8, i64 1 - %15 = extractelement <4 x float> %8, i64 2 - %16 = extractelement <4 x float> %8, i64 3 - call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %12, i32 %9, i32 %10, i32 undef, float %13, float %14, float %15, float %16, i8 15) - call void @llvm.lifetime.end.p0i8(i64 16, i8* %4) #1 - ret void -} - -; Function Attrs: nounwind -declare !pointeetys !59 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #1 - -; Function Attrs: nounwind -declare void @dx.op.textureStore.f32(i32, %dx.types.Handle, i32, i32, i32, float, float, float, float, i8) #1 - -; 
Function Attrs: nounwind memory(none) -declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #3 - -; Function Attrs: nounwind memory(none) -declare float @dx.op.objectRayDirection.f32(i32, i8) #3 - -; Function Attrs: nounwind memory(none) -declare float @dx.op.objectRayOrigin.f32(i32, i8) #3 - -; Function Attrs: nounwind memory(read) -declare float @dx.op.rayTCurrent.f32(i32) #4 - -declare void @dx.op.acceptHitAndEndSearch(i32) #0 - -declare void @dx.op.ignoreHit(i32) #0 - -; Function Attrs: nounwind -declare !pointeetys !60 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) #1 - -; Function Attrs: nounwind -declare !pointeetys !61 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes2(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes2*) #1 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #3 - -; Function Attrs: nounwind memory(read) -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !63 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !63 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5 - -attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" 
"stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind memory(none) } -attributes #4 = { nounwind memory(read) } -attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{!10} -!dx.entryPoints = !{!18, !29 } -!lgc.cps.module = !{} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{i32 1, void ()* @MyRayGen, !11} -!11 = !{!12} -!12 = !{i32 1, !13, !13} -!13 = !{} -!14 = !{!12, !15, !16} -!15 = !{i32 2, !13, !13} -!16 = !{i32 0, !13, !13} -!17 = !{!12, !15} -!18 = !{null, !"", null, !3, !19} -!19 = !{i32 0, i64 65536} -!21 = !{i32 8, i32 9, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!22 = !{i32 0} -!24 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!26 = !{i32 8, i32 8, i32 5, !22} -!28 = !{i32 8, i32 11, i32 6, i32 16, i32 5, !22} -!29 = !{void ()* @MyRayGen, !"MyRayGen", null, null, !30} -!30 = !{i32 8, i32 7, i32 5, !22} -!32 = !{%struct.AnyHitTraversalData poison} -!33 = !{i32 0, %struct.AnyHitTraversalData poison} -!34 = !{%struct.SystemData poison} -!35 = !{i32 0, %struct.SystemData poison} -!36 = !{%struct.SystemData poison} -!37 = !{%struct.SystemData poison} -!38 = !{%struct.DispatchSystemData poison} -!39 
= !{i32 0, %struct.DispatchSystemData poison} -!40 = !{%struct.TraversalData poison} -!41 = !{i32 0, %struct.TraversalData poison} -!42 = !{%struct.SystemData poison} -!43 = !{%struct.DispatchSystemData poison} -!44 = !{%struct.AnyHitTraversalData poison} -!45 = !{%struct.DispatchSystemData poison} -!46 = !{%struct.DispatchSystemData poison} -!47 = !{%struct.AnyHitTraversalData poison} -!48 = !{%struct.DispatchSystemData poison} -!49 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!50 = !{i32 0, %struct.HitData poison} -!51 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!52 = !{!53, !53, i64 0} -!53 = !{!"omnipotent char", !54, i64 0} -!54 = !{!"Simple C/C++ TBAA"} -!55 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} -!56 = !{i32 0, %struct.RayPayload poison} -!57 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} -!58 = !{%struct.RayPayload poison} -!59 = !{%struct.RayPayload poison} -!60 = !{%struct.BuiltInTriangleIntersectionAttributes poison} -!61 = !{%struct.BuiltInTriangleIntersectionAttributes2 poison} -!62 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes2 poison} -!63 = !{i8 poison} -!64 = !{i32 0, i8 poison} diff --git a/llvmraytracing/test/dx/inline-const-jump-target.ll b/llvmraytracing/test/dx/inline-const-jump-target.ll deleted file mode 100644 index 2d20231e67..0000000000 --- a/llvmraytracing/test/dx/inline-const-jump-target.ll +++ /dev/null @@ -1,158 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s -; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,lgc-cps-jump-inliner,lint,remove-types-metadata" -S %s 
--lint-abort-on-error | FileCheck -check-prefix=JUMP-INLINER-CPS %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.DispatchSystemData = type { i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float } -%struct.HitData = type { float, i32 } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.TheirParams = type { i32 } -%struct.Payload = type {} -%"class.RWTexture2D >" = type { <4 x float> } - -@debug_global = external global i32 - -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -declare i32 @lgc.rt.shader.index() - -declare i32 @_cont_GetContinuationStackAddr() - -declare !pointeetys !13 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeetys !13 { -; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define i32 @_cont_GetLocalRootIndex( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: ptr [[DATA:%.*]]) { -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: ret i32 5 -; -; JUMP-INLINER-CPS-LABEL: define i32 @_cont_GetLocalRootIndex( -; JUMP-INLINER-CPS-SAME: ptr [[DATA:%.*]]) { -; JUMP-INLINER-CPS-NEXT: ret i32 5 -; - ret i32 5 -} - -; Need _cont_ReportHit to get system data type -declare !pointeetys !21 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) - -declare void @lgc.cps.jump(...) 
#1 -declare i32 @get.ret.addr() - -declare !pointeetys !15 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) - -declare !pointeetys !13 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) -declare i32 @_AmdGetFuncAddrCallable() - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !13 { - ret void -} - -define internal void @Callable(%struct.Payload* %payload) !pointeetys !23 !lgc.rt.shaderstage !25 { -; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define internal void @Callable( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [0 x i32] [[PADDING:%.*]], [0 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META15:![0-9]+]] !lgc.cps [[META16:![0-9]+]] !continuation.registercount [[META8:![0-9]+]] !continuation [[META17:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: entry: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP0:%.*]] = alloca [[STRUCT_PAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[SHADER_INDEX]], ptr @debug_global, align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]]), !continuation.registercount [[META8]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; -entry: - %val = call i32 @lgc.rt.shader.index() - store i32 %val, ptr @debug_global - ret void -} - -define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) !pointeetys !13 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %callable.addr = call i32 @_AmdGetFuncAddrCallable() - %ret.addr = call i32 @get.ret.addr() - call void (...) @lgc.cps.jump(i32 %callable.addr, i32 2, i32 poison, i32 %ret.addr, i32 999, %struct.DispatchSystemData %dis_data, {} poison, [0 x i32] poison, [0 x i32] poison) - unreachable -} - -define void @main() { -; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @main( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !lgc.cps [[META18:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META19:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS:%.*]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = call i32 (...) 
@lgc.cps.as.continuation.reference(ptr @Callable) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RET_ADDR_I:%.*]] = call i32 @get.ret.addr() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[TMP2]], i32 2, i32 poison, i32 [[RET_ADDR_I]], i32 999, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], {} poison, [0 x i32] poison, [0 x i32] poison) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE-CPS: _cont_CallShader.exit: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @lgc.cps.complete() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; -; JUMP-INLINER-CPS-LABEL: define void @main( -; JUMP-INLINER-CPS-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META15:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META16:![0-9]+]] { -; JUMP-INLINER-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_I:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; JUMP-INLINER-CPS-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS:%.*]], align 4 -; JUMP-INLINER-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; JUMP-INLINER-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 -; JUMP-INLINER-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; JUMP-INLINER-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; JUMP-INLINER-CPS-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; JUMP-INLINER-CPS-NEXT: [[RET_ADDR_I:%.*]] = call i32 @get.ret.addr() -; JUMP-INLINER-CPS-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[SYSTEM_DATA_ALLOCA_I]]) -; JUMP-INLINER-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], ptr [[SYSTEM_DATA_ALLOCA_I]], align 4 -; JUMP-INLINER-CPS-NEXT: store i32 999, ptr @debug_global, align 4 -; JUMP-INLINER-CPS-NEXT: [[TMP2:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr 
[[SYSTEM_DATA_ALLOCA_I]], align 4 -; JUMP-INLINER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RET_ADDR_I]], i32 6, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP2]]), !continuation.registercount [[META8]] -; JUMP-INLINER-CPS-NEXT: unreachable -; JUMP-INLINER-CPS: Callable.exit: -; JUMP-INLINER-CPS-NEXT: unreachable -; JUMP-INLINER-CPS: _cont_CallShader.exit: -; JUMP-INLINER-CPS-NEXT: call void @lgc.cps.complete() -; JUMP-INLINER-CPS-NEXT: unreachable -; - %params = alloca %struct.TheirParams, align 4 - call void @dx.op.callShader.struct.TheirParams(i32 159, i32 1, %struct.TheirParams* nonnull %params) - ret void -} - -; Function Attrs: nounwind -declare !pointeetys !19 void @dx.op.callShader.struct.TheirParams(i32, i32, %struct.TheirParams*) #0 - -attributes #0 = { nounwind } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.entryPoints = !{!3, !6} -!lgc.cps.module = !{} - -attributes #1 = { noreturn } - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{null, !"", null, !4, !12} -!4 = !{!5, !9, null, null} -!5 = !{!6} -!6 = !{void ()* @main, !"main", null, null, !7} -!7 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !8} -!8 = !{i32 0} -!9 = !{!10} -!10 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !11} -!11 = !{i32 0, i32 9} -!12 = !{i32 0, i64 65536} -!13 = !{%struct.DispatchSystemData poison} -!15 = !{%struct.SystemData poison} -!19 = !{%struct.TheirParams poison} -!21 = !{%struct.AnyHitTraversalData poison} -!23 = !{%struct.Payload poison} -!25 = !{i32 5} diff --git a/llvmraytracing/test/dx/intersection-registercount.ll b/llvmraytracing/test/dx/intersection-registercount.ll deleted file mode 100644 index 0500ba35e9..0000000000 --- 
a/llvmraytracing/test/dx/intersection-registercount.ll +++ /dev/null @@ -1,184 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: opt --verify-each --report-payload-register-sizes=max -passes='dxil-cont-prepare-gpurt-library,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,continuations-stats-report,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s - -; CHECK: Incoming and max outgoing payload VGPR size of "Intersection" (intersection): 25 and 25 dwords - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.DispatchSystemData = type { <3 x i32> } -%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float } -%struct.HitData = type { float, i32 } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.RayPayload = type { <4 x float> } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -declare i64 @_cont_GetTraversalAddr() #0 - -declare i32 @_cont_GetContinuationStackAddr() #0 - -declare !pointeetys !16 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 - -declare !pointeetys !18 void 
@_cont_SetTriangleHitAttributes(%struct.SystemData*, %struct.BuiltInTriangleIntersectionAttributes) #0 - -declare !pointeetys !19 i1 @_cont_IsEndSearch(%struct.TraversalData*) #0 - -declare %struct.DispatchSystemData @_cont_Traversal(%struct.TraversalData) #0 - -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i32, %struct.AnyHitTraversalData, float, i32) #0 - -declare !pointeetys !21 %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData*) #0 - -declare !pointeetys !23 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0 - -declare !pointeetys !24 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #0 !pointeetys !24 { - ret i32 5 -} - -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !26 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %newdata = call %struct.DispatchSystemData @_cont_Traversal(%struct.TraversalData %trav_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - ret void -} - -define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 !pointeetys !27 { - %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i32 3, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) - store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 - ret i1 true -} - -; Function Attrs: nounwind memory(none) -declare !pointeetys !28 i32 
@_cont_DispatchRaysIndex(%struct.DispatchSystemData* nocapture readnone, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !28 i32 @_cont_DispatchRaysDimensions(%struct.DispatchSystemData* nocapture readnone, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !29 float @_cont_WorldRayOrigin(%struct.DispatchSystemData* nocapture readnone, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !29 float @_cont_WorldRayDirection(%struct.DispatchSystemData* nocapture readnone, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !30 float @_cont_RayTMin(%struct.DispatchSystemData* nocapture readnone) #1 - -; Function Attrs: nounwind memory(read) -declare !pointeetys !31 float @_cont_RayTCurrent(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*) #2 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !24 i32 @_cont_RayFlags(%struct.DispatchSystemData* nocapture readnone) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !33 i32 @_cont_InstanceIndex(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !33 i32 @_cont_InstanceID(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !33 i32 @_cont_PrimitiveIndex(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !34 float @_cont_ObjectRayOrigin(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !34 float @_cont_ObjectRayDirection(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !35 float @_cont_ObjectToWorld(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*, i32, i32) #1 - -; 
Function Attrs: nounwind memory(none) -declare !pointeetys !35 float @_cont_WorldToObject(%struct.DispatchSystemData* nocapture readnone, %struct.HitData*, i32, i32) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !36 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, %struct.HitData*) #1 - -; Function Attrs: nounwind -define void @Intersection() #3 !lgc.rt.shaderstage !41 { - ret void -} - -; Function Attrs: nounwind memory(read) -declare !pointeetys !37 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #2 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 - -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !39 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !39 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #4 - -attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind memory(none) } -attributes #2 = { nounwind memory(read) } -attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!llvm.ident = !{!0} 
-!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{} -!dx.entryPoints = !{!10, !12} -!continuation.maxPayloadRegisterCount = !{!15} -!continuation.minPayloadRegisterCount = !{!14} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{null, !"", null, !3, !11} -!11 = !{i32 0, i64 65536} -!12 = !{void ()* @Intersection, !"Intersection", null, null, !13} -!13 = !{i32 8, i32 8, i32 5, !14} -!14 = !{i32 0} -!15 = !{i32 25} -!16 = !{%struct.SystemData poison} -!17 = !{i32 0, %struct.SystemData poison} -!18 = !{%struct.SystemData poison} -!19 = !{%struct.TraversalData poison} -!20 = !{i32 0, %struct.TraversalData poison} -!21 = !{%struct.AnyHitTraversalData poison} -!22 = !{i32 0, %struct.AnyHitTraversalData poison} -!23 = !{%struct.SystemData poison} -!24 = !{%struct.DispatchSystemData poison} -!25 = !{i32 0, %struct.DispatchSystemData poison} -!26 = !{%struct.DispatchSystemData poison} -!27 = !{%struct.AnyHitTraversalData poison} -!28 = !{%struct.DispatchSystemData poison} -!29 = !{%struct.DispatchSystemData poison} -!30 = !{%struct.DispatchSystemData poison} -!31 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!32 = !{i32 0, %struct.HitData poison} -!33 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!34 = !{null, %struct.DispatchSystemData poison, 
%struct.HitData poison} -!35 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!36 = !{null, %struct.SystemData poison, %struct.HitData poison} -!37 = !{%struct.RayPayload poison} -!38 = !{i32 0, %struct.RayPayload poison} -!39 = !{i8 poison} -!40 = !{i32 0, i8 poison} -!41 = !{i32 1} diff --git a/llvmraytracing/test/dx/intrinsics/complete.ll b/llvmraytracing/test/dx/intrinsics/complete.ll deleted file mode 100644 index fc460fb4da..0000000000 --- a/llvmraytracing/test/dx/intrinsics/complete.ll +++ /dev/null @@ -1,76 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck --check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck --check-prefix=CLEANUP %s - -%struct.DispatchSystemData = type { i32 } -%struct.TraversalData = type { i32 } - -@debug_global = external global i32 -declare i32 @Val(i32) -declare void @_AmdComplete() -declare !pointeetys !2 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !2 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) -declare !pointeetys !3 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) - -define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage !0 { -; LOWERRAYTRACINGPIPELINE-LABEL: define void @_cont_Traversal( -; LOWERRAYTRACINGPIPELINE-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: AllocaSpillBB: -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_TRAVERSALDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_TRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL:%.*]] = call i32 @Val(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[VAL]], ptr @debug_global, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; -; CLEANUP-LABEL: define void @_cont_Traversal( -; CLEANUP-SAME: i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.state [[META1:![0-9]+]] { -; CLEANUP-NEXT: AllocaSpillBB: -; CLEANUP-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CLEANUP-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CLEANUP-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 -; CLEANUP-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 -; CLEANUP-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 -; CLEANUP-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 3 -; CLEANUP-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 4 -; CLEANUP-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 5 -; CLEANUP-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 6 -; CLEANUP-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 7 -; CLEANUP-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 8 -; CLEANUP-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = 
extractvalue [30 x i32] [[PAYLOAD]], 9 -; CLEANUP-NEXT: [[PAYLOAD_FCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 10 -; CLEANUP-NEXT: [[PAYLOAD_FCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 11 -; CLEANUP-NEXT: [[PAYLOAD_FCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 12 -; CLEANUP-NEXT: [[PAYLOAD_FCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 13 -; CLEANUP-NEXT: [[PAYLOAD_FCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 14 -; CLEANUP-NEXT: [[PAYLOAD_FCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 15 -; CLEANUP-NEXT: [[PAYLOAD_FCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 16 -; CLEANUP-NEXT: [[PAYLOAD_FCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 17 -; CLEANUP-NEXT: [[PAYLOAD_FCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 18 -; CLEANUP-NEXT: [[PAYLOAD_FCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 19 -; CLEANUP-NEXT: [[PAYLOAD_FCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 20 -; CLEANUP-NEXT: [[PAYLOAD_FCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 21 -; CLEANUP-NEXT: [[PAYLOAD_FCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 22 -; CLEANUP-NEXT: [[PAYLOAD_FCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 23 -; CLEANUP-NEXT: [[PAYLOAD_FCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 24 -; CLEANUP-NEXT: [[PAYLOAD_FCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 25 -; CLEANUP-NEXT: [[PAYLOAD_FCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 26 -; CLEANUP-NEXT: [[PAYLOAD_FCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 27 -; CLEANUP-NEXT: [[PAYLOAD_FCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 28 -; CLEANUP-NEXT: [[PAYLOAD_FCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 29 -; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0 -; CLEANUP-NEXT: [[VAL:%.*]] = call i32 @Val(i32 5) -; 
CLEANUP-NEXT: ret void -; -AllocaSpillBB: - %val = call i32 @Val(i32 5) - call void @_AmdComplete() - store i32 %val, i32* @debug_global, align 4 - unreachable -} - -!0 = !{i32 6} -!2 = !{%struct.DispatchSystemData poison} -!3 = !{%struct.TraversalData poison} diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll deleted file mode 100644 index 3731be8012..0000000000 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll +++ /dev/null @@ -1,64 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=MINCOUNT %s -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-MINCOUNT %s - -%struct.DispatchSystemData = type { i32 } - -@debug_global = external global i32 - -declare i32 @_AmdContPayloadRegistersI32Count() -%struct.TraversalData = type { i32 } - -declare !pointeetys !9 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !9 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) -declare !pointeetys !12 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { - ret void -} - -define void @main() { -; MINCOUNT-LABEL: define void @main( 
-; MINCOUNT-SAME: i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !continuation [[META10:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] { -; MINCOUNT-NEXT: entry: -; MINCOUNT-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; MINCOUNT-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; MINCOUNT-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 -; MINCOUNT-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; MINCOUNT-NEXT: store i32 11, ptr @debug_global, align 4 -; MINCOUNT-NEXT: ret void -; -; LOWERRAYTRACINGPIPELINE-MINCOUNT-LABEL: define void @main( -; LOWERRAYTRACINGPIPELINE-MINCOUNT-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !continuation [[META10:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] !continuation.registercount [[META5]] { -; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: entry: -; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: store i32 11, ptr @debug_global, align 4 -; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: call void @lgc.cps.complete() -; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: unreachable -; -entry: - %val = call i32 @_AmdContPayloadRegistersI32Count() - store i32 %val, i32* @debug_global, align 4 - ret void -} - -!dx.entryPoints = !{!0, !3} -!continuation.maxPayloadRegisterCount = !{!7} -!continuation.maxUsedPayloadRegisterCount = !{!8} - -!0 = !{null, !"", null, !1, !6} -!1 = !{!2, null, null, null} -!2 = !{!3} -!3 = !{void ()* @main, !"main", null, 
null, !4} -!4 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !5} -!5 = !{i32 0} -!6 = !{i32 0, i64 65536} -!7 = !{i32 15} -!8 = !{i32 11} -!9 = !{%struct.DispatchSystemData poison} -!10 = !{i32 0, %struct.DispatchSystemData poison} -!11 = !{i32 0, %struct.TraversalData poison} -!12 = !{%struct.TraversalData poison} diff --git a/llvmraytracing/test/dx/intrinsics/cont-stack-access.ll b/llvmraytracing/test/dx/intrinsics/cont-stack-access.ll deleted file mode 100644 index 61a01913e4..0000000000 --- a/llvmraytracing/test/dx/intrinsics/cont-stack-access.ll +++ /dev/null @@ -1,98 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='cleanup-continuations,lint' -S --lint-abort-on-error | FileCheck %s -check-prefix=STACK_SCRATCH -; RUN: grep -v SKIP_SCRATCH_ADDRSPACE %s | opt --verify-each -passes='cleanup-continuations,lint' -S --lint-abort-on-error | FileCheck %s -check-prefix=STACK_GLOBAL - -declare i32 @_AmdContStackAlloc(i32 %size) -declare i32 @_AmdContStackLoadI32(i32 %addr) -declare i32 @_AmdContStackLoadLastUseI32(i32 %addr) -declare i32 @_AmdContStackStoreI32(i32 %addr, i32 %val) -declare i32 @_AmdContStackFree(i32 %size) -declare i32 @_cont_GetContinuationStackAddr() #0 -declare i64 @_cont_GetContinuationStackGlobalMemBase() ; SKIP_GLOBAL_ADDRSPACE - -%struct.DispatchSystemData = type { i32 } -%struct.type = type { <2 x float> } - -%struct.Payload = type { [8 x i32] } - -@debug_global = external global i32 - -declare void @lgc.cps.complete() - -define void @main(%struct.type %cont.state, i32 %return.addr, i32 %shader.index, %struct.DispatchSystemData %system.data) !lgc.rt.shaderstage !14 !lgc.cps !15 !continuation !{ptr @main} { -; STACK_SCRATCH-LABEL: define void @main( -; STACK_SCRATCH-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_TYPE:%.*]] [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]]) !lgc.rt.shaderstage 
[[META5:![0-9]+]] !lgc.cps [[META8:![0-9]+]] !continuation [[META9:![0-9]+]] !continuation.stacksize [[META10:![0-9]+]] !continuation.state [[META5]] { -; STACK_SCRATCH-NEXT: entry: -; STACK_SCRATCH-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; STACK_SCRATCH-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; STACK_SCRATCH-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 -; STACK_SCRATCH-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 -; STACK_SCRATCH-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 -; STACK_SCRATCH-NEXT: [[PTR_FINAL:%.*]] = add i32 [[TMP0]], 4 -; STACK_SCRATCH-NEXT: [[TMP2:%.*]] = inttoptr i32 [[PTR_FINAL]] to ptr addrspace(21) -; STACK_SCRATCH-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; STACK_SCRATCH-NEXT: store i32 10, ptr addrspace(21) [[TMP3]], align 4 -; STACK_SCRATCH-NEXT: [[TMP4:%.*]] = inttoptr i32 [[PTR_FINAL]] to ptr addrspace(21) -; STACK_SCRATCH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 -; STACK_SCRATCH-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(21) [[TMP5]], align 4 -; STACK_SCRATCH-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; STACK_SCRATCH-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], -12 -; STACK_SCRATCH-NEXT: store i32 [[TMP8]], ptr [[CSP]], align 4 -; STACK_SCRATCH-NEXT: store i32 [[TMP6]], ptr @debug_global, align 4 -; STACK_SCRATCH-NEXT: ret void -; -; STACK_GLOBAL-LABEL: define void @main( -; STACK_GLOBAL-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_TYPE:%.*]] [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !lgc.cps [[META8:![0-9]+]] !continuation [[META9:![0-9]+]] !continuation.stacksize [[META10:![0-9]+]] !continuation.state [[META5]] { -; STACK_GLOBAL-NEXT: entry: -; STACK_GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; STACK_GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; STACK_GLOBAL-NEXT: [[TMP0:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() -; STACK_GLOBAL-NEXT: [[TMP1:%.*]] = 
inttoptr i64 [[TMP0]] to ptr addrspace(22) -; STACK_GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 -; STACK_GLOBAL-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 12 -; STACK_GLOBAL-NEXT: store i32 [[TMP3]], ptr [[CSP]], align 4 -; STACK_GLOBAL-NEXT: [[PTR_FINAL:%.*]] = add i32 [[TMP2]], 4 -; STACK_GLOBAL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[PTR_FINAL]] -; STACK_GLOBAL-NEXT: store i32 10, ptr addrspace(22) [[TMP4]], align 4 -; STACK_GLOBAL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[PTR_FINAL]] -; STACK_GLOBAL-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(22) [[TMP5]], align 4 -; STACK_GLOBAL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[PTR_FINAL]] -; STACK_GLOBAL-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(22) [[TMP7]], align 4, !amdgpu.last.use [[META11:![0-9]+]] -; STACK_GLOBAL-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; STACK_GLOBAL-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], -12 -; STACK_GLOBAL-NEXT: store i32 [[TMP10]], ptr [[CSP]], align 4 -; STACK_GLOBAL-NEXT: store i32 [[TMP6]], ptr @debug_global, align 4 -; STACK_GLOBAL-NEXT: ret void -; -entry: - %ptr = call i32 @_AmdContStackAlloc(i32 12) - %ptr.final = add i32 %ptr, 4 - call void @_AmdContStackStoreI32(i32 %ptr.final, i32 10) - %val = call i32 @_AmdContStackLoadI32(i32 %ptr.final) - %val.2 = call i32 @_AmdContStackLoadLastUseI32(i32 %ptr.final) ; SKIP_GLOBAL_ADDRSPACE - call void @_AmdContStackFree(i32 12) - store i32 %val, ptr @debug_global - call void @lgc.cps.complete() - unreachable -} - -!dx.entryPoints = !{!1, !5} -!continuation.stackAddrspace = !{!16} ; SKIP_GLOBAL_ADDRSPACE -!continuation.stackAddrspace = !{!17} ; SKIP_SCRATCH_ADDRSPACE - -!1 = !{null, !"", null, !3, !2} -!2 = !{i32 0, i64 65536} -!3 = !{!4, null, null, null} -!4 = !{!5} -!5 = !{void ()* @main, !"main", null, null, !6} -!6 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !7} -!7 = !{i32 0} -!9 = !{i32 8, i32 10, i32 6, 
i32 16, i32 7, i32 8, i32 5, !7} -!10 = !{!"function", !"void", !11, !11} -!11 = !{i32 0, %struct.Payload poison} -!12 = !{!"function", i32 poison, !13, !14} -!13 = !{i32 0, %struct.DispatchSystemData poison} -!14 = !{i32 0} -!15 =!{i32 1} -!16 = !{i32 22} -!17 = !{i32 21} diff --git a/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll b/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll deleted file mode 100644 index 5c16f541ba..0000000000 --- a/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll +++ /dev/null @@ -1,81 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function main --version 2 -; RUN: opt --verify-each -passes='cgscc(inline),lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s - -declare i32 @_AmdContStackAlloc(i32 %size) -declare i32 @_AmdContPayloadRegistersI32Count() -declare i32 @_cont_GetContinuationStackAddr() #0 - -%struct.DispatchSystemData = type { i32 } -%struct.HitData = type { float, i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.TraversalData = type { <3 x float>, <3 x float>, float } - -declare !pointeetys !15 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !15 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) -declare !pointeetys !16 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.DispatchSystemData*) -declare !pointeetys !12 i32 @_cont_HitKind(%struct.DispatchSystemData*, %struct.HitData*) - -declare !pointeetys !21 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) - -%struct.Payload = type { [8 x i32] } - -@debug_global = external global i32 - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys 
!{%struct.DispatchSystemData poison} { - ret void -} - -define void @main() !lgc.rt.shaderstage !17 { -; CHECK-LABEL: define void @main -; CHECK-SAME: (i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] !continuation [[META12:![0-9]+]] !continuation.stacksize [[META13:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 -; CHECK-NEXT: [[PL_BYTES:%.*]] = mul i32 30, 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 120 -; CHECK-NEXT: store i32 [[TMP2]], ptr [[CSP]], align 4 -; CHECK-NEXT: store i32 [[TMP1]], ptr @debug_global, align 4 -; CHECK-NEXT: ret void -; -entry: - %pl_size = call i32 @_AmdContPayloadRegistersI32Count() - %pl_bytes = mul i32 %pl_size, 4 - %val = call i32 @_AmdContStackAlloc(i32 %pl_bytes) - store i32 %val, ptr @debug_global - ret void -} - -; Check for correct stack size -; CHECK: !{{.*}} = !{i32 120} - -; Define hit shader to increase payload size -define void @chit(%struct.Payload* %pl, %struct.Payload* %attrs) !pointeetys !10 !lgc.rt.shaderstage !18 { - ret void -} - -!dx.entryPoints = !{!1, !5, !8} -!continuation.maxUsedPayloadRegisterCount = !{!19} - -!1 = !{null, !"", null, !3, !2} -!2 = !{i32 0, i64 65536} -!3 = !{!4, null, null, null} -!4 = !{!5} -!5 = !{void ()* @main, !"main", null, null, !6} -!6 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !7} -!7 = !{i32 0} -!8 = !{void (%struct.Payload*, %struct.Payload*)* @chit, !"chit", null, null, !9} -!9 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, !7} -!10 = !{null, %struct.Payload poison, %struct.Payload poison} -!11 = !{i32 0, %struct.Payload poison} -!12 = !{null, %struct.DispatchSystemData poison, 
%struct.HitData poison} -!13 = !{i32 0, %struct.DispatchSystemData poison} -!14 = !{i32 0, %struct.HitData poison} -!15 = !{%struct.DispatchSystemData poison} -!16 = !{%struct.DispatchSystemData poison} -!17 = !{i32 0} -!18 = !{i32 3} -!19 = !{i32 30} -!20 = !{i32 0, %struct.TraversalData poison} -!21 = !{%struct.TraversalData poison} diff --git a/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-false.ll b/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-false.ll deleted file mode 100644 index 2a5ca044e3..0000000000 --- a/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-false.ll +++ /dev/null @@ -1,38 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,dxil-cont-prepare-gpurt-library,lint' -S %s --lint-abort-on-error | FileCheck %s - -%struct.DispatchSystemData = type { i32 } - -@debug_global = external global { i64, i32 } - -declare i1 @_AmdContinuationStackIsGlobal() - -declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) - -define void @main() { -; CHECK-LABEL: define void @main( -; CHECK-SAME: ) !lgc.rt.shaderstage [[META5:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.rt.shader.index() -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP0]]) -; CHECK-NEXT: store i1 false, ptr @debug_global, align 1 -; CHECK-NEXT: ret void -; -entry: - %val = call i1 @_AmdContinuationStackIsGlobal() - store i1 %val, ptr @debug_global - ret void -} - -!dx.entryPoints = !{!0, !3} -!continuation.stackAddrspace = !{!7} - -!0 = !{null, !"", null, !1, !6} -!1 = !{!2, null, null, null} -!2 = !{!3} -!3 = !{i1 ()* @main, !"main", null, null, !4} -!4 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !5} -!5 = !{i32 0} -!6 = !{i32 0, i64 65536} -!7 = !{i32 21} -!8 = !{%struct.DispatchSystemData poison} diff --git 
a/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-true.ll b/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-true.ll deleted file mode 100644 index 0437a15a2f..0000000000 --- a/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-true.ll +++ /dev/null @@ -1,38 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,dxil-cont-prepare-gpurt-library,lint' -S %s --lint-abort-on-error | FileCheck %s - -%struct.DispatchSystemData = type { i32 } - -@debug_global = external global i1 - -declare i1 @_AmdContinuationStackIsGlobal() - -declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) - -define void @main() { -; CHECK-LABEL: define void @main( -; CHECK-SAME: ) !lgc.rt.shaderstage [[META5:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.rt.shader.index() -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP0]]) -; CHECK-NEXT: store i1 true, ptr @debug_global, align 1 -; CHECK-NEXT: ret void -; -entry: - %val = call i1 @_AmdContinuationStackIsGlobal() - store i1 %val, ptr @debug_global - ret void -} - -!dx.entryPoints = !{!0, !3} -!continuation.stackAddrspace = !{!7} - -!0 = !{null, !"", null, !1, !6} -!1 = !{!2, null, null, null} -!2 = !{!3} -!3 = !{i1 ()* @main, !"main", null, null, !4} -!4 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !5} -!5 = !{i32 0} -!6 = !{i32 0, i64 65536} -!7 = !{i32 22} -!8 = !{%struct.DispatchSystemData poison} diff --git a/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll b/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll deleted file mode 100644 index 934eb4b267..0000000000 --- a/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll +++ /dev/null @@ -1,52 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: grep -v lgc.cps.module %s | 
opt --verify-each -passes='lower-raytracing-pipeline,lint' -S --lint-abort-on-error | FileCheck --check-prefix=CHECK %s -; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck --check-prefix=CHECK-CPS %s - -%struct.DispatchSystemData = type { i32 } - -declare void @Use(i32) -declare i32 @_AmdGetCurrentFuncAddr() - -declare !pointeetys !2 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !2 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) - -define void @MyRayGen() { -; CHECK-LABEL: define void @MyRayGen() { -; CHECK-NEXT: AllocaSpillBB: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @MyRayGen) -; CHECK-NEXT: call void @Use(i32 [[TMP0]]) -; CHECK-NEXT: ret void -; -; CHECK-CPS-LABEL: define void @MyRayGen() { -; CHECK-CPS-NEXT: AllocaSpillBB: -; CHECK-CPS-NEXT: [[TMP0:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @MyRayGen) -; CHECK-CPS-NEXT: call void @Use(i32 [[TMP0]]) -; CHECK-CPS-NEXT: ret void -; -AllocaSpillBB: - %val = call i32 @_AmdGetCurrentFuncAddr() - call void @Use(i32 %val) - ret void -} - -define void @MyRayGen.resume.0() { -; CHECK-LABEL: define void @MyRayGen.resume.0() { -; CHECK-NEXT: entryresume.0: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @MyRayGen.resume.0) -; CHECK-NEXT: call void @Use(i32 [[TMP0]]) -; CHECK-NEXT: ret void -; -; CHECK-CPS-LABEL: define void @MyRayGen.resume.0() { -; CHECK-CPS-NEXT: entryresume.0: -; CHECK-CPS-NEXT: [[TMP0:%.*]] = call i32 (...) 
@lgc.cps.as.continuation.reference(ptr @MyRayGen.resume.0) -; CHECK-CPS-NEXT: call void @Use(i32 [[TMP0]]) -; CHECK-CPS-NEXT: ret void -; -entryresume.0: - %val = call i32 @_AmdGetCurrentFuncAddr() - call void @Use(i32 %val) - ret void -} - -!lgc.cps.module = !{} -!2 = !{%struct.DispatchSystemData poison} diff --git a/llvmraytracing/test/dx/intrinsics/get-rtip.ll b/llvmraytracing/test/dx/intrinsics/get-rtip.ll deleted file mode 100644 index 34fc841570..0000000000 --- a/llvmraytracing/test/dx/intrinsics/get-rtip.ll +++ /dev/null @@ -1,28 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint' -S %s --lint-abort-on-error | FileCheck %s - -declare i32 @_AmdGetRtip() - -%struct.DispatchSystemData = type { i32 } -declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) - -@debug_global = external global i32 - -define void @main() !lgc.rt.shaderstage !1 { -; CHECK-LABEL: define void @main( -; CHECK-SAME: ) !lgc.rt.shaderstage [[META2:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 2, ptr @debug_global, align 4 -; CHECK-NEXT: ret void -; -entry: - %val = call i32 @_AmdGetRtip() - store i32 %val, ptr @debug_global - ret void -} - -!continuation.rtip = !{!0} - -!0 = !{i32 2} -!1 = !{i32 0} -!8 = !{%struct.DispatchSystemData poison} diff --git a/llvmraytracing/test/dx/intrinsics/get-setting.ll b/llvmraytracing/test/dx/intrinsics/get-setting.ll deleted file mode 100644 index 8cd86f5e3e..0000000000 --- a/llvmraytracing/test/dx/intrinsics/get-setting.ll +++ /dev/null @@ -1,25 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint' -S %s --lint-abort-on-error | FileCheck %s - -declare i32 @_AmdGetSetting_123() - -@debug_global = external global i32 - -define void @main() !lgc.rt.shaderstage !1 { -; 
CHECK-LABEL: define void @main( -; CHECK-SAME: ) !lgc.rt.shaderstage [[META1:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 42, ptr @debug_global, align 4 -; CHECK-NEXT: ret void -; -entry: - %val = call i32 @_AmdGetSetting_123() - store i32 %val, ptr @debug_global - ret void -} - -!gpurt.settings = !{!2} - -!0 = !{i32 3} -!1 = !{i32 0} -!2 = !{i64 123, i64 42} diff --git a/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll b/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll deleted file mode 100644 index a6f086fec1..0000000000 --- a/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll +++ /dev/null @@ -1,76 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck %s - -%struct.DispatchSystemData = type { i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.HitData = type { float, i32 } -%struct.Payload = type { i32 } -%struct.TraversalData = type { i32 } -declare i32 @_AmdGetShaderKind() - -declare !pointeetys !3 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !3 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) -declare !pointeetys !5 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.DispatchSystemData*) -declare !pointeetys !6 i32 @_cont_HitKind(%struct.DispatchSystemData*, %struct.HitData*) - -declare !pointeetys !18 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) - -; Check that GetShaderKind calls in non-shaders, like left-over intrinsics, are ignored. 
-define float @_cont_RayTCurrent() { -; CHECK-LABEL: define float @_cont_RayTCurrent() { -; CHECK-NEXT: [[K:%.*]] = call i32 @_AmdGetShaderKind() -; CHECK-NEXT: [[F:%.*]] = sitofp i32 [[K]] to float -; CHECK-NEXT: ret float [[F]] -; - %k = call i32 @_AmdGetShaderKind() - %f = sitofp i32 %k to float - ret float %f -} - -; Note: DXILShaderKind::Miss has value 11 -define void @MyMiss(%struct.Payload* %payload) !pointeetys !1 !lgc.rt.shaderstage !16 { -; CHECK-LABEL: define void @MyMiss -; CHECK-SAME: (i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META9:![0-9]+]] !continuation.registercount [[META5:![0-9]+]] !continuation [[META10:![0-9]+]] { -; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [7 x i32], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_PAYLOAD:%.*]], align 8 -; CHECK-NEXT: store [1 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @_cont_GetLocalRootIndex(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP3]]) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: store i32 11, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: store i32 [[TMP8]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load 
[[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], [8 x i32] poison, [1 x i32] [[TMP10]]), !continuation.registercount [[META5]] -; CHECK-NEXT: unreachable -; - %1 = call i32 @_AmdGetShaderKind() - %2 = getelementptr inbounds %struct.Payload, %struct.Payload* %payload, i32 0, i32 0 - store i32 %1, i32* %2, align 4 - ret void -} - -!dx.entryPoints = !{!12, !13} - -!1 = !{%struct.Payload poison} -!2 = !{i32 0, %struct.Payload poison} -!3 = !{%struct.DispatchSystemData poison} -!4 = !{i32 0, %struct.DispatchSystemData poison} -!5 = !{%struct.DispatchSystemData poison} -!6 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!7 = !{i32 0, %struct.HitData poison} -!12 = !{null, !"", null, null, null} -!13 = !{void (%struct.Payload*)* @MyMiss, !"MyMiss", null, null, !14} -; The metadata on this line identifies @MyMiss as miss shader -!14 = !{i32 8, i32 11, i32 6, i32 4, i32 5, !15} -!15 = !{i32 0} -!16 = !{i32 4} -!17 = !{i32 0, %struct.TraversalData poison} -!18 = !{%struct.TraversalData poison} diff --git a/llvmraytracing/test/dx/intrinsics/get-shader-rec-idx.ll b/llvmraytracing/test/dx/intrinsics/get-shader-rec-idx.ll deleted file mode 100644 index 6fd4f887e5..0000000000 --- a/llvmraytracing/test/dx/intrinsics/get-shader-rec-idx.ll +++ /dev/null @@ -1,95 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,lower-raytracing-pipeline,lint' -S --lint-abort-on-error | FileCheck --check-prefix=CHECK-NON-CPS %s -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck 
--check-prefix=CHECK-CPS %s - -%struct.DispatchSystemData = type { i32 } -%struct.MyParams = type { i32 } -%struct.AnyHitTraversalData = type { i64 } - -declare void @Use(i32) -declare i32 @_AmdGetShaderRecordIndex() -declare !pointeetys !2 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !2 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) -declare !pointeetys !5 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) - -define void @MyRayGen() !lgc.rt.shaderstage !1 { -; CHECK-NON-CPS-LABEL: define void @MyRayGen( -; CHECK-NON-CPS-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.entry [[META6:![0-9]+]] !continuation.registercount [[META4]] { -; CHECK-NON-CPS-NEXT: AllocaSpillBB: -; CHECK-NON-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; CHECK-NON-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 -; CHECK-NON-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NON-CPS-NEXT: call void @Use(i32 0) -; CHECK-NON-CPS-NEXT: call void @lgc.cps.complete() -; CHECK-NON-CPS-NEXT: unreachable -; -; CHECK-CPS-LABEL: define void @MyRayGen( -; CHECK-CPS-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation.registercount [[META4]] { -; CHECK-CPS-NEXT: AllocaSpillBB: -; CHECK-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; CHECK-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 -; CHECK-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-CPS-NEXT: call void @Use(i32 0) -; CHECK-CPS-NEXT: call void @lgc.cps.complete() 
-; CHECK-CPS-NEXT: unreachable -; -AllocaSpillBB: - %idx = call i32 @_AmdGetShaderRecordIndex() - call void @Use(i32 %idx) - ret void -} - -define void @called(%struct.MyParams* %params) !pointeetys !4 !lgc.rt.shaderstage !3 { -; CHECK-NON-CPS-LABEL: define void @called( -; CHECK-NON-CPS-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [9 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META7:![0-9]+]] !continuation [[META8:![0-9]+]] !continuation.registercount [[META1:![0-9]+]] { -; CHECK-NON-CPS-NEXT: AllocaSpillBB: -; CHECK-NON-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; CHECK-NON-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 -; CHECK-NON-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 -; CHECK-NON-CPS-NEXT: store [1 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NON-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NON-CPS-NEXT: [[TMP2:%.*]] = call i32 @_cont_GetLocalRootIndex(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-NON-CPS-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 -; CHECK-NON-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NON-CPS-NEXT: store i32 [[TMP4]], ptr [[TMP3]], align 4 -; CHECK-NON-CPS-NEXT: call void @Use(i32 [[TMP2]]) -; CHECK-NON-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 -; CHECK-NON-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 -; CHECK-NON-CPS-NEXT: store i32 [[TMP6]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NON-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NON-CPS-NEXT: [[TMP8:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NON-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], [9 x i32] poison, [1 x i32] [[TMP8]]), !continuation.registercount [[META1]] -; CHECK-NON-CPS-NEXT: unreachable -; -; CHECK-CPS-LABEL: define void @called( -; CHECK-CPS-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [9 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META6:![0-9]+]] !continuation [[META7:![0-9]+]] !lgc.cps [[META8:![0-9]+]] !continuation.registercount [[META1]] { -; CHECK-CPS-NEXT: AllocaSpillBB: -; CHECK-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; CHECK-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 -; CHECK-CPS-NEXT: [[TMP0:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 -; CHECK-CPS-NEXT: store [1 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-CPS-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP0]], i32 0 -; CHECK-CPS-NEXT: [[TMP2:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-CPS-NEXT: store i32 [[TMP2]], ptr [[TMP1]], align 4 -; CHECK-CPS-NEXT: call void @Use(i32 [[SHADER_INDEX]]) -; CHECK-CPS-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP0]], i32 0 -; CHECK-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 -; CHECK-CPS-NEXT: store i32 [[TMP4]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-CPS-NEXT: [[TMP5:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-CPS-NEXT: [[TMP6:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], [9 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META1]] -; CHECK-CPS-NEXT: unreachable -; -AllocaSpillBB: - %idx = call i32 @_AmdGetShaderRecordIndex() - call void @Use(i32 %idx) - ret void -} - -!lgc.cps.module = !{} -!1 = !{i32 0} -!2 = !{%struct.DispatchSystemData poison} -!3 = !{i32 5} -!4 = !{%struct.MyParams poison} -!5 = !{%struct.AnyHitTraversalData poison} diff --git a/llvmraytracing/test/dx/intrinsics/is-llpc.ll b/llvmraytracing/test/dx/intrinsics/is-llpc.ll deleted file mode 100644 index 7a5c0e808b..0000000000 --- a/llvmraytracing/test/dx/intrinsics/is-llpc.ll +++ /dev/null @@ -1,26 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint' -S %s --lint-abort-on-error | FileCheck %s - -declare i1 @_AmdIsLlpc() - -%struct.DispatchSystemData = type { i32 } -declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) - -@debug_global = external global i32 - -define void @main() !lgc.rt.shaderstage !1 { -; CHECK-LABEL: define void @main( -; CHECK-SAME: ) !lgc.rt.shaderstage [[META1:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: store i1 false, ptr @debug_global, align 1 -; CHECK-NEXT: ret void -; -entry: - %val = call i1 @_AmdIsLlpc() - store i1 %val, ptr @debug_global - ret void -} - -!0 = !{i32 2} -!1 = !{i32 0} -!8 = !{%struct.DispatchSystemData poison} diff --git a/llvmraytracing/test/dx/intrinsics/shader-index.ll b/llvmraytracing/test/dx/intrinsics/shader-index.ll deleted file mode 100644 index dc686d69d3..0000000000 --- a/llvmraytracing/test/dx/intrinsics/shader-index.ll +++ /dev/null @@ -1,89 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each 
-passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint" -S %s --lint-abort-on-error | FileCheck %s - -%struct.DispatchSystemData = type { i32 } -%struct.Payload = type { i32 } - -@debug_global = external global i32 - -declare i32 @lgc.rt.shader.index() - -declare !pointeetys !8 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) - -define i1 @_cont_ReportHit(%struct.DispatchSystemData* %data, float %t, i32 %hitKind) #0 !pointeetys !20 { - ret i1 true -} - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { - ret void -} - -define void @main() !lgc.rt.shaderstage !24 { -; CHECK-LABEL: define void @main( -; CHECK-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META12:![0-9]+]] !lgc.cps [[META10:![0-9]+]] !continuation.registercount [[META12]] !continuation [[META13:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 -; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; CHECK-NEXT: store i32 0, ptr @debug_global, align 4 -; CHECK-NEXT: call void @lgc.cps.complete() -; CHECK-NEXT: unreachable -; -entry: - %val = call i32 @lgc.rt.shader.index() - store i32 %val, ptr @debug_global - ret void -} - -define void @callable(%struct.Payload* %payload) !pointeetys !22 !lgc.rt.shaderstage !25 { -; CHECK-LABEL: define void @callable( -; CHECK-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage 
[[META14:![0-9]+]] !lgc.cps [[META15:![0-9]+]] !continuation.registercount [[META10]] !continuation [[META16:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = alloca [[STRUCT_PAYLOAD:%.*]], align 8 -; CHECK-NEXT: store [1 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP1]], align 4 -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) -; CHECK-NEXT: store i32 [[SHADER_INDEX]], ptr @debug_global, align 4 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], [8 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META10]] -; CHECK-NEXT: unreachable -; -entry: - %val = call i32 @lgc.rt.shader.index() - store i32 %val, ptr @debug_global - ret void -} - -!dx.entryPoints = !{!0, !3, !10} -!continuation.stackAddrspace = !{!7} -!lgc.cps.module = !{} - -!0 = !{null, !"", null, !1, !6} -!1 = !{!2, null, null, null} -!2 = !{!3} -!3 = !{i1 ()* @main, !"main", null, null, !4} -!4 = !{i32 8, i32 7} -!6 = !{i32 0, i64 65536} -!7 = !{i32 21} -!8 = !{%struct.DispatchSystemData poison} -!9 = !{i32 0, %struct.DispatchSystemData poison} -!10 = !{i1 ()* @callable, !"callable", null, null, !11} -!11 = !{i32 8, i32 12} -!20 = !{%struct.DispatchSystemData poison} -!21 = !{i32 0, %struct.DispatchSystemData poison} -!22 = !{%struct.Payload poison} -!23 = !{i32 0, %struct.Payload poison} -!24 = !{i32 0} -!25 = !{i32 5} diff --git a/llvmraytracing/test/dx/intrinsics/value-i32.ll b/llvmraytracing/test/dx/intrinsics/value-i32.ll deleted file mode 100644 index 500aadb368..0000000000 --- a/llvmraytracing/test/dx/intrinsics/value-i32.ll +++ /dev/null @@ -1,46 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt --verify-each -passes='cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck %s - -%struct.Payload = type { float, i32, i64, i32 } - -declare !pointeetys !0 i32 @_AmdValueI32Count(%struct.Payload*) - -declare !pointeetys !0 i32 @_AmdValueGetI32(%struct.Payload*, i32) - -declare !pointeetys !0 void @_AmdValueSetI32(%struct.Payload*, i32, i32) - -define i32 @count(%struct.Payload* %pl) !pointeetys !0 { -; CHECK-LABEL: define i32 @count -; CHECK-SAME: (ptr [[PL:%.*]]) !pointeetys [[META1:![0-9]+]] { -; CHECK-NEXT: ret i32 5 -; - %val = call i32 @_AmdValueI32Count(%struct.Payload* %pl) - ret i32 %val -} - -define i32 @get(%struct.Payload* 
%pl) !pointeetys !0 { -; CHECK-LABEL: define i32 @get -; CHECK-SAME: (ptr [[PL:%.*]]) !pointeetys [[META1]] { -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PL]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: ret i32 [[TMP2]] -; - %val = call i32 @_AmdValueGetI32(%struct.Payload* %pl, i32 2) - ret i32 %val -} - -define void @set(%struct.Payload* %pl, i32 %val) !pointeetys !0 { -; CHECK-LABEL: define void @set -; CHECK-SAME: (ptr [[PL:%.*]], i32 [[VAL:%.*]]) !pointeetys [[META1]] { -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PL]], i32 2 -; CHECK-NEXT: store i32 [[VAL]], ptr [[TMP1]], align 4 -; CHECK-NEXT: ret void -; - call void @_AmdValueSetI32(%struct.Payload* %pl, i32 2, i32 %val) - ret void -} - -!continuation.stackAddrspace = !{!5} - -!0 = !{%struct.Payload poison} -!5 = !{i32 21} diff --git a/llvmraytracing/test/dx/lint/multiple-setlocalrootindex-pre-coro.ll b/llvmraytracing/test/dx/lint/multiple-setlocalrootindex-pre-coro.ll deleted file mode 100644 index 82dce60286..0000000000 --- a/llvmraytracing/test/dx/lint/multiple-setlocalrootindex-pre-coro.ll +++ /dev/null @@ -1,26 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: opt --verify-each -passes='continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s - -; CHECK-NOT: Found a function with more than one call to setLocalRootIndex -; CHECK-LABEL: define void @RayGen( - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.DispatchSystemData = type { i32 } - -declare void @amd.dx.setLocalRootIndex(i32) -declare void @lgc.cps.await__void(...) 
- -define void @RayGen(i32 %dummyRetAddr, %struct.DispatchSystemData %0) !lgc.rt.shaderstage !0 !continuation.entry !1 !continuation !2 { - call void @amd.dx.setLocalRootIndex(i32 0) - call void (...) @lgc.cps.await__void(i32 2, i32 3) - call void @amd.dx.setLocalRootIndex(i32 5) - ret void -} - -!continuation.stackAddrspace = !{!3} - -!0 = !{i32 0} -!1 = !{} -!2 = !{void ()* @RayGen} -!3 = !{i32 21} diff --git a/llvmraytracing/test/dx/lint/multiple-setlocalrootindex.ll b/llvmraytracing/test/dx/lint/multiple-setlocalrootindex.ll deleted file mode 100644 index 6cbbadc302..0000000000 --- a/llvmraytracing/test/dx/lint/multiple-setlocalrootindex.ll +++ /dev/null @@ -1,24 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: not opt --verify-each -passes='continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s - -; CHECK: Found a function with more than one call to setLocalRootIndex -; CHECK-NEXT: ptr @RayGen - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.DispatchSystemData = type { i32 } - -declare void @amd.dx.setLocalRootIndex(i32) - -define void @RayGen(i32 %dummyRetAddr, %struct.DispatchSystemData %0) !lgc.rt.shaderstage !0 !continuation.entry !1 !continuation !2 { - call void @amd.dx.setLocalRootIndex(i32 0) - call void @amd.dx.setLocalRootIndex(i32 5) - ret void -} - -!continuation.stackAddrspace = !{!3} - -!0 = !{i32 0} -!1 = !{} -!2 = !{void ()* @RayGen} -!3 = !{i32 21} diff --git a/llvmraytracing/test/dx/lint/undef-jump-target.ll b/llvmraytracing/test/dx/lint/undef-jump-target.ll deleted file mode 100644 index e54d7afb31..0000000000 --- a/llvmraytracing/test/dx/lint/undef-jump-target.ll +++ /dev/null @@ -1,22 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: not opt --verify-each 
-passes='continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s - -; CHECK: Jump has undefined jump target - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.DispatchSystemData = type { i32 } - -declare void @lgc.cps.jump(...) - -define void @RayGen(i32 %dummyRetAddr, %struct.DispatchSystemData %0) !lgc.rt.shaderstage !0 !continuation.entry !1 !continuation !2 { - call void (...) @lgc.cps.jump(i32 undef), !continuation.registercount !0 - unreachable -} - -!continuation.stackAddrspace = !{!3} - -!0 = !{i32 0} -!1 = !{} -!2 = !{void ()* @RayGen} -!3 = !{i32 21} diff --git a/llvmraytracing/test/dx/lower-await.ll b/llvmraytracing/test/dx/lower-await.ll deleted file mode 100644 index 95ec1860c0..0000000000 --- a/llvmraytracing/test/dx/lower-await.ll +++ /dev/null @@ -1,257 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='lower-await,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=AWAIT %s -; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CORO %s -; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANED %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -declare ptr @async_fun(i32, i32) -declare ptr @async_fun_with_waitmask(i32, i32) -declare ptr @async_fun_with_arg(i32, i32, i32) 
-declare void @lgc.cps.await__void(...) -declare { i32 } @lgc.cps.await__i32(...) -declare void @lgc.cps.jump(...) -declare void @lgc.cps.complete() - -define void @simple_await(i32 %dummyRetAddr) !continuation.registercount !1 { -; AWAIT-LABEL: define { ptr, ptr } @simple_await( -; AWAIT-SAME: i32 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { -; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.simple_await, ptr @continuation.malloc, ptr @continuation.free) -; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; AWAIT-NEXT: [[TMP4:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) -; AWAIT-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DUMMYRETADDR]], i32 -1, i32 poison, i32 poison), !continuation.registercount [[META1]] -; AWAIT-NEXT: unreachable -; -; CORO-LABEL: define { ptr, ptr } @simple_await( -; CORO-SAME: i32 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { -; CORO-NEXT: AllocaSpillBB: -; CORO-NEXT: [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 -; CORO-NEXT: store i32 [[DUMMYRETADDR]], ptr [[DUMMYRETADDR_SPILL_ADDR]], align 4 -; CORO-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CORO-NEXT: [[TMP1:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CORO-NEXT: [[TMP2:%.*]] = call ptr [[TMP1]](i32 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; CORO-NEXT: [[TMP3:%.*]] = insertvalue { ptr, ptr } poison, ptr @simple_await.resume.0, 0 -; CORO-NEXT: [[TMP4:%.*]] = insertvalue { ptr, ptr } [[TMP3]], ptr [[TMP2]], 1 -; CORO-NEXT: ret { ptr, ptr } [[TMP4]] -; -; CLEANED-LABEL: define void @simple_await( -; CLEANED-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { -; CLEANED-NEXT: AllocaSpillBB: -; CLEANED-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CLEANED-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 8 -; CLEANED-NEXT: store i32 [[TMP5]], ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) -; CLEANED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; CLEANED-NEXT: store i32 [[DUMMYRETADDR]], ptr addrspace(21) [[TMP3]], align 4 -; CLEANED-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CLEANED-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CALLEE]] to 
ptr -; CLEANED-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @simple_await.resume.0) -; CLEANED-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANED-NEXT: call void (...) @lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP6]], i32 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; CLEANED-NEXT: unreachable -; - %callee = ptrtoint ptr @async_fun to i32 - call void (...) @lgc.cps.await__void(i32 %callee, i32 2), !continuation.registercount !1, !continuation.returnedRegistercount !1 - call void (...) @lgc.cps.jump(i32 %dummyRetAddr, i32 -1, i32 poison, i32 poison), !continuation.registercount !1 - unreachable -} - -define void @simple_await_entry() !continuation.entry !0 !continuation.registercount !1 { -; AWAIT-LABEL: define { ptr, ptr } @simple_await_entry( -; AWAIT-SAME: ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation.entry [[META3:![0-9]+]] !continuation [[META4:![0-9]+]] { -; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.simple_await_entry, ptr @continuation.malloc, ptr @continuation.free) -; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; AWAIT-NEXT: [[TMP4:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) 
@llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) -; AWAIT-NEXT: call void @lgc.cps.complete() -; AWAIT-NEXT: unreachable -; -; CORO-LABEL: define { ptr, ptr } @simple_await_entry( -; CORO-SAME: ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation.entry [[META3:![0-9]+]] !continuation [[META4:![0-9]+]] { -; CORO-NEXT: AllocaSpillBB: -; CORO-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CORO-NEXT: [[TMP1:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CORO-NEXT: [[TMP2:%.*]] = call ptr [[TMP1]](i32 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; CORO-NEXT: [[TMP3:%.*]] = insertvalue { ptr, ptr } poison, ptr @simple_await_entry.resume.0, 0 -; CORO-NEXT: [[TMP4:%.*]] = insertvalue { ptr, ptr } [[TMP3]], ptr [[TMP2]], 1 -; CORO-NEXT: ret { ptr, ptr } [[TMP4]] -; -; CLEANED-LABEL: define void @simple_await_entry( -; CLEANED-SAME: i32 [[CSPINIT:%.*]]) !continuation.registercount [[META1]] !continuation.entry [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.state [[META1]] { -; CLEANED-NEXT: AllocaSpillBB: -; CLEANED-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CLEANED-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CLEANED-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CLEANED-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CLEANED-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @simple_await_entry.resume.0) -; CLEANED-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANED-NEXT: call void (...) @lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP2]], i32 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; CLEANED-NEXT: unreachable -; - %callee = ptrtoint ptr @async_fun to i32 - call void (...) 
@lgc.cps.await__void(i32 %callee, i32 2), !continuation.registercount !1, !continuation.returnedRegistercount !1 - ; Note: entry functions don't need a registercount annotation on return - call void @lgc.cps.complete() - unreachable -} - -define void @await_with_arg(i32 %dummyRetAddr, i32 %i) !continuation.registercount !1 { -; AWAIT-LABEL: define { ptr, ptr } @await_with_arg( -; AWAIT-SAME: i32 [[DUMMYRETADDR:%.*]], i32 [[I:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { -; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.await_with_arg, ptr @continuation.malloc, ptr @continuation.free) -; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_arg to i32 -; AWAIT-NEXT: [[TMP4:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CALLEE]], i32 2, i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) -; AWAIT-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DUMMYRETADDR]], i32 -1, i32 poison, i32 poison), !continuation.registercount [[META1]] -; AWAIT-NEXT: unreachable -; -; CORO-LABEL: define { ptr, ptr } @await_with_arg( -; CORO-SAME: i32 [[DUMMYRETADDR:%.*]], i32 [[I:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { -; CORO-NEXT: AllocaSpillBB: -; CORO-NEXT: [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_ARG_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 -; CORO-NEXT: store i32 [[DUMMYRETADDR]], ptr [[DUMMYRETADDR_SPILL_ADDR]], align 4 -; CORO-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_arg to i32 -; CORO-NEXT: [[TMP1:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CORO-NEXT: [[TMP2:%.*]] = call ptr [[TMP1]](i32 [[CALLEE]], i32 2, i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; CORO-NEXT: [[TMP3:%.*]] = insertvalue { ptr, ptr } poison, ptr @await_with_arg.resume.0, 0 -; CORO-NEXT: [[TMP4:%.*]] = insertvalue { ptr, ptr } [[TMP3]], ptr [[TMP2]], 1 -; CORO-NEXT: ret { ptr, ptr } [[TMP4]] -; -; CLEANED-LABEL: define void @await_with_arg( -; CLEANED-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRETADDR:%.*]], i32 [[I:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { -; CLEANED-NEXT: AllocaSpillBB: -; CLEANED-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CLEANED-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 8 -; CLEANED-NEXT: store i32 [[TMP5]], ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) -; CLEANED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; CLEANED-NEXT: store i32 [[DUMMYRETADDR]], ptr addrspace(21) [[TMP3]], align 4 -; CLEANED-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_arg to i32 -; CLEANED-NEXT: 
[[TMP0:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CLEANED-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @await_with_arg.resume.0) -; CLEANED-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANED-NEXT: call void (...) @lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP6]], i32 [[TMP1]], i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; CLEANED-NEXT: unreachable -; - %callee = ptrtoint ptr @async_fun_with_arg to i32 - call void (...) @lgc.cps.await__void(i32 %callee, i32 2, i32 %i), !continuation.registercount !1, !continuation.returnedRegistercount !1 - call void (...) @lgc.cps.jump(i32 %dummyRetAddr, i32 -1, i32 poison, i32 poison), !continuation.registercount !1 - unreachable -} - -define i32 @await_with_ret_value(i32 %dummyRetAddr) !continuation.registercount !1 { -; AWAIT-LABEL: define { ptr, ptr } @await_with_ret_value( -; AWAIT-SAME: i32 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] { -; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.await_with_ret_value, ptr @continuation.malloc, ptr @continuation.free) -; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; AWAIT-NEXT: [[TMP4:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) -; AWAIT-NEXT: [[TMP7:%.*]] = call { i32 } @lgc.ilcps.getReturnValue__sl_i32s() -; AWAIT-NEXT: [[RES_2:%.*]] = extractvalue { i32 } [[TMP7]], 0 -; AWAIT-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DUMMYRETADDR]], i32 -1, i32 poison, i32 poison, i32 [[RES_2]]), !continuation.registercount [[META1]] -; AWAIT-NEXT: unreachable -; -; CORO-LABEL: define { ptr, ptr } @await_with_ret_value( -; CORO-SAME: i32 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] { -; CORO-NEXT: AllocaSpillBB: -; CORO-NEXT: [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 -; CORO-NEXT: store i32 [[DUMMYRETADDR]], ptr [[DUMMYRETADDR_SPILL_ADDR]], align 4 -; CORO-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CORO-NEXT: [[TMP1:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CORO-NEXT: [[TMP2:%.*]] = call ptr [[TMP1]](i32 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; CORO-NEXT: [[TMP3:%.*]] = insertvalue { ptr, ptr } poison, ptr @await_with_ret_value.resume.0, 0 -; CORO-NEXT: [[TMP4:%.*]] = insertvalue { ptr, ptr } [[TMP3]], ptr [[TMP2]], 1 -; CORO-NEXT: ret { ptr, ptr } [[TMP4]] -; -; CLEANED-LABEL: define void @await_with_ret_value( -; CLEANED-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { -; CLEANED-NEXT: AllocaSpillBB: -; CLEANED-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CLEANED-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 8 -; CLEANED-NEXT: store i32 [[TMP5]], ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) -; CLEANED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; CLEANED-NEXT: store i32 [[DUMMYRETADDR]], ptr addrspace(21) [[TMP3]], align 4 -; CLEANED-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i32 -; CLEANED-NEXT: [[TMP0:%.*]] = 
inttoptr i32 [[CALLEE]] to ptr -; CLEANED-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @await_with_ret_value.resume.0) -; CLEANED-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANED-NEXT: call void (...) @lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP6]], i32 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; CLEANED-NEXT: unreachable -; - %callee = ptrtoint ptr @async_fun to i32 - %res = call { i32 } (...) @lgc.cps.await__i32(i32 %callee, i32 2), !continuation.registercount !1, !continuation.returnedRegistercount !1 - %res.2 = extractvalue { i32 } %res, 0 - call void (...) @lgc.cps.jump(i32 %dummyRetAddr, i32 -1, i32 poison, i32 poison, i32 %res.2), !continuation.registercount !1 - unreachable -} - -define void @wait_await(i32 %dummyRetAddr) !continuation.registercount !1 { -; AWAIT-LABEL: define { ptr, ptr } @wait_await( -; AWAIT-SAME: i32 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] { -; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.wait_await, ptr @continuation.malloc, ptr @continuation.free) -; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_waitmask to i32 -; AWAIT-NEXT: [[TMP4:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META8:![0-9]+]] -; AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) -; AWAIT-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DUMMYRETADDR]], i32 -1, i32 poison, i32 poison, i32 poison), !continuation.registercount [[META1]] -; AWAIT-NEXT: unreachable -; -; CORO-LABEL: define { ptr, ptr } @wait_await( -; CORO-SAME: i32 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] { -; CORO-NEXT: AllocaSpillBB: -; CORO-NEXT: [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[WAIT_AWAIT_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 -; CORO-NEXT: store i32 [[DUMMYRETADDR]], ptr [[DUMMYRETADDR_SPILL_ADDR]], align 4 -; CORO-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_waitmask to i32 -; CORO-NEXT: [[TMP1:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CORO-NEXT: [[TMP2:%.*]] = call ptr [[TMP1]](i32 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META8:![0-9]+]] -; CORO-NEXT: [[TMP3:%.*]] = insertvalue { ptr, ptr } poison, ptr @wait_await.resume.0, 0 -; CORO-NEXT: [[TMP4:%.*]] = insertvalue { ptr, ptr } [[TMP3]], ptr [[TMP2]], 1 -; CORO-NEXT: ret { ptr, ptr } [[TMP4]] -; -; CLEANED-LABEL: define void @wait_await( -; CLEANED-SAME: i32 [[CSPINIT:%.*]], i32 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1]] !continuation [[META8:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { -; CLEANED-NEXT: AllocaSpillBB: -; CLEANED-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CLEANED-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 8 -; CLEANED-NEXT: store i32 [[TMP5]], ptr [[CSP]], align 4 -; CLEANED-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) -; CLEANED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; CLEANED-NEXT: store i32 [[DUMMYRETADDR]], ptr addrspace(21) [[TMP3]], align 4 -; CLEANED-NEXT: [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_waitmask to i32 -; CLEANED-NEXT: 
[[TMP0:%.*]] = inttoptr i32 [[CALLEE]] to ptr -; CLEANED-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @wait_await.resume.0) -; CLEANED-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; CLEANED-NEXT: call void (...) @lgc.cps.jump(i32 [[CALLEE]], i32 -1, i32 [[TMP6]], i32 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META9:![0-9]+]] -; CLEANED-NEXT: unreachable -; - %callee = ptrtoint ptr @async_fun_with_waitmask to i32 - call void (...) @lgc.cps.await__void(i32 %callee, i32 2), !waitmask !3, !continuation.registercount !1, !continuation.returnedRegistercount !1 - call void (...) @lgc.cps.jump(i32 %dummyRetAddr, i32 -1, i32 poison, i32 poison, i32 poison), !continuation.registercount !1 - unreachable -} - -!continuation.stackAddrspace = !{!2} - -!0 = !{} -!1 = !{i32 0} -!2 = !{i32 21} -!3 = !{i32 -1} diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll b/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll deleted file mode 100644 index 7b412f68e3..0000000000 --- a/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll +++ /dev/null @@ -1,95 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes="lower-raytracing-pipeline,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.DispatchSystemData = type { <3 x i32> } -%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float, i64 } -%struct.SystemData = type { %struct.DispatchSystemData, 
%struct.BuiltInTriangleIntersectionAttributes } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.HitData = type { float, i32 } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -declare void @lgc.ilcps.waitContinue(...) noreturn - -declare !pointeetys !24 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -declare !pointeetys !24 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) - -declare !pointeetys !27 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { - %dispatchPayloadPtr = getelementptr inbounds %struct.DispatchSystemData, ptr %data, i32 0, i32 0 - %dispatchPayload = load <3 x i32>, ptr %dispatchPayloadPtr, align 4 - %deadLaneDispatchPayload = insertelement <3 x i32> %dispatchPayload, i32 -11, i32 0 - %systemData = insertvalue %struct.SystemData poison, <3 x i32> %deadLaneDispatchPayload, 0, 0 - %addrSuffix = load i32, ptr %data, align 4 - %addr = zext i32 %addrSuffix to i64 - call void @lgc.ilcps.waitContinue(i64 %addr, i64 -1, %struct.SystemData %systemData) - unreachable -} - -; Function Attrs: nounwind -define void @MyRayGen() #0 !lgc.rt.shaderstage !20 { - ret void -} - -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{!10} 
-!dx.entryPoints = !{!21} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{i32 1, void ()* @MyRayGen, !11} -!11 = !{!12} -!12 = !{i32 1, !13, !13} -!13 = !{} -!14 = !{!12, !15, !16} -!15 = !{i32 2, !13, !13} -!16 = !{i32 0, !13, !13} -!17 = !{!12, !15} -!18 = !{null, !"", null, !3, !19} -!19 = !{i32 0, i64 65536} -!20 = !{i32 0} -!21 = !{void ()* @MyRayGen, !"MyRayGen", null, null, !22} -!22 = !{i32 8, i32 7, i32 5, !23} -!23 = !{i32 0} -!24 = !{%struct.DispatchSystemData poison} -!25 = !{i32 0, %struct.DispatchSystemData poison} -!26 = !{i32 0, %struct.TraversalData poison} -!27 = !{%struct.TraversalData poison} -; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META16]] !continuation [[META19:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[DISPATCHPAYLOAD_I:%.*]] = load <3 x i32>, ptr 
[[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[DEADLANEDISPATCHPAYLOAD_I:%.*]] = insertelement <3 x i32> [[DISPATCHPAYLOAD_I]], i32 -11, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEMDATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] poison, <3 x i32> [[DEADLANEDISPATCHPAYLOAD_I]], 0, 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDRSUFFIX_I:%.*]] = load i32, ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = zext i32 [[ADDRSUFFIX_I]] to i64 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.ilcps.waitContinue(i64 [[ADDR_I]], i64 -1, [[STRUCT_SYSTEMDATA]] [[SYSTEMDATA_I]]) #[[ATTR3:[0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE: _cont_ExitRayGen.exit: -; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() -; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll b/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll deleted file mode 100644 index b8c0ff6c29..0000000000 --- a/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll +++ /dev/null @@ -1,259 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; Test copying of fields between local and global payload whose size -; is not a multiple of i32s, requiring copies at a smaller granularity -; for at least a suffix of the fields. 
-; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -; This payload struct is PAQed as follows: -; struct [raypayload] Payload -; { -; int v[5] : write(caller) : read(miss, caller); -; min16uint smallField : write(miss) : read(caller); -; min16uint3 smallFieldVec : write(miss) : read(caller); -; }; -; The last two fields are particularly relevant. -; The i16 needs special treatment, as well as the last two bytes of the <3 x i16>. -%struct.PAQPayload = type { [5 x i32], i16, <3 x i16> } -; Identical, but without PAQ: -%struct.NoPAQPayload = type { [5 x i32], i16, <3 x i16> } -%struct.DispatchSystemData = type { i32 } -%struct.TraversalData = type { %struct.SystemData } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.HitData = type { float, i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } - -; Function Attrs: nounwind -define void @MissPAQ(%struct.PAQPayload* noalias nocapture %payload) #0 !pointeetys !17 { - %1 = getelementptr inbounds %struct.PAQPayload, %struct.PAQPayload* %payload, i32 0, i32 1 - store i16 17, i16* %1, align 4 - ret void -} - -; Function Attrs: nounwind -define void @MissNoPAQ(%struct.NoPAQPayload* noalias nocapture %payload) #0 !pointeetys !31 { - %1 = getelementptr inbounds %struct.NoPAQPayload, %struct.NoPAQPayload* %payload, i32 0, i32 1 - store i16 17, i16* %1, align 4 - ret void -} - -; Function Attrs: alwaysinline -declare !pointeetys !19 %struct.BuiltInTriangleIntersectionAttributes 
@_cont_GetTriangleHitAttributes(%struct.SystemData*) #1 - -; Function Attrs: alwaysinline -declare !pointeetys !21 void @_cont_SetTriangleHitAttributes(%struct.SystemData*, %struct.BuiltInTriangleIntersectionAttributes) #1 - -; Function Attrs: alwaysinline -declare !pointeetys !22 i1 @_cont_IsEndSearch(%struct.TraversalData*) #1 - -; Function Attrs: nounwind memory(read) -declare !pointeetys !24 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, %struct.HitData*) #2 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !26 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #3 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !28 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #3 - -declare !pointeetys !30 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -; Function Attrs: alwaysinline -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #1 !pointeetys !30 { - ret i32 5 -} - -declare !pointeetys !31 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) - -attributes #0 = { nounwind } -attributes #1 = { alwaysinline } -attributes #2 = { nounwind memory(read) } -attributes #3 = { nounwind memory(none) } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.typeAnnotations = !{!3} -!dx.dxrPayloadAnnotations = !{!8} -!dx.entryPoints = !{!12, !14, !33} - -!0 = !{!"dxcoob 2019.05.00"} -!1 = !{i32 1, i32 7} -!2 = !{!"lib", i32 6, i32 7} -!3 = !{i32 1, void (%struct.PAQPayload*)* @MissPAQ, !4} -!4 = !{!5, !7} -!5 = !{i32 1, !6, !6} -!6 = !{} -!7 = !{i32 2, !6, !6} -!8 = !{i32 0, %struct.PAQPayload undef, !9} -!9 = !{!10, !11, !11} -!10 = !{i32 0, i32 259} -!11 = !{i32 0, i32 513} -!12 = !{null, !"", null, null, !13} -!13 = !{i32 0, i64 32} -!14 = !{void (%struct.PAQPayload*)* @MissPAQ, !"MissPAQ", null, null, !15} -!15 = !{i32 8, i32 11, i32 6, i32 24, i32 5, !16} -!16 = !{i32 0} -!17 = !{%struct.PAQPayload poison} -!18 
= !{i32 0, %struct.PAQPayload poison} -!19 = !{%struct.SystemData poison} -!20 = !{i32 0, %struct.SystemData poison} -!21 = !{%struct.SystemData poison} -!22 = !{%struct.TraversalData poison} -!23 = !{i32 0, %struct.TraversalData poison} -!24 = !{null, %struct.SystemData poison, %struct.HitData poison} -!25 = !{i32 0, %struct.HitData poison} -!26 = !{%struct.DispatchSystemData poison} -!27 = !{i32 0, %struct.DispatchSystemData poison} -!28 = !{%struct.AnyHitTraversalData poison} -!29 = !{i32 0, %struct.AnyHitTraversalData poison} -!30 = !{%struct.DispatchSystemData poison} -!31 = !{%struct.NoPAQPayload poison} -!32 = !{i32 0, %struct.NoPAQPayload poison} -!33 = !{void (%struct.NoPAQPayload*)* @MissNoPAQ, !"MissNoPAQ", null, null, !34} -!34 = !{i32 8, i32 11, i32 6, i32 24, i32 5, !35} -!35 = !{i32 0} - -; CHECK-LABEL: define void @MissPAQ( -; CHECK-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [16 x i32] [[PADDING:%.*]], [11 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META21:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] !continuation [[META23:![0-9]+]] { -; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [11 x i32], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_PAYLOAD:%.*]], align 8 -; CHECK-NEXT: store [11 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4 -; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP19]], align 4 -; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP25]], align 4 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP13]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 3 -; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP34]], align 4 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP15]], align 4 -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 1 -; CHECK-NEXT: store i16 17, ptr [[TMP17]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 1 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[TMP18]], align 1 -; CHECK-NEXT: store i8 [[TMP21]], ptr [[TMP20]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP20]], i32 1 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[TMP22]], align 1 -; CHECK-NEXT: store i8 [[TMP23]], ptr [[TMP35]], align 1 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 2 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, 
ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP24]], align 4 -; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[TMP26]], i32 4 -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP24]], i32 4 -; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 -; CHECK-NEXT: store i8 [[TMP29]], ptr [[TMP37]], align 1 -; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[TMP26]], i32 5 -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 -; CHECK-NEXT: store i8 [[TMP31]], ptr [[TMP38]], align 1 -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP32]], align 4 -; CHECK-NEXT: [[TMP36:%.*]] = load [11 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP33]], [16 x i32] poison, [11 x i32] [[TMP36]]), !continuation.registercount [[META22]] -; CHECK-NEXT: unreachable -; -; -; CHECK-LABEL: define void @MissNoPAQ( -; CHECK-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [16 x i32] [[PADDING:%.*]], [14 x i32] [[PAYLOAD:%.*]]) #[[ATTR0]] !lgc.rt.shaderstage [[META21]] !continuation.registercount [[META19:![0-9]+]] !continuation [[META24:![0-9]+]] { -; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [14 x i32], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_NOPAQPAYLOAD:%.*]], align 8 -; CHECK-NEXT: store [14 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_NOPAQPAYLOAD]], ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP17]], align 4 -; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP23]], align 4 -; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr 
inbounds i32, ptr [[TMP17]], i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP26]], align 4 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 4 -; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP28]], align 4 -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 5 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 5 -; CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP22]], align 4 -; CHECK-NEXT: store i32 [[TMP54]], ptr [[TMP18]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 6 -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 6 -; CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP31]], align 4 -; CHECK-NEXT: store i32 [[TMP55]], ptr [[TMP20]], align 4 -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_NOPAQPAYLOAD]], ptr [[TMP2]], i32 0, i32 1 -; CHECK-NEXT: store i16 17, ptr [[TMP24]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_NOPAQPAYLOAD]], ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4 -; CHECK-NEXT: store i32 [[TMP29]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 1 -; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP27]], align 4 -; 
CHECK-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 1 -; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 -; CHECK-NEXT: store i32 [[TMP35]], ptr [[TMP33]], align 4 -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 2 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 -; CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 -; CHECK-NEXT: store i32 [[TMP38]], ptr [[TMP36]], align 4 -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 3 -; CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP40]], align 4 -; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP39]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 4 -; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 4 -; CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 -; CHECK-NEXT: store i32 [[TMP44]], ptr [[TMP42]], align 4 -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 5 -; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 5 -; CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP52]], align 4 -; CHECK-NEXT: store i32 [[TMP47]], ptr [[TMP51]], align 4 -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 6 -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 6 -; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP49]], align 4 -; CHECK-NEXT: store i32 [[TMP50]], ptr [[TMP48]], align 4 -; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-NEXT: [[TMP46:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP45]], align 4 -; CHECK-NEXT: [[TMP53:%.*]] = 
load [14 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP46]], [16 x i32] poison, [14 x i32] [[TMP53]]), !continuation.registercount [[META19]] -; CHECK-NEXT: unreachable -; -; -; CHECK-LABEL: define i32 @_cont_GetLocalRootIndex( -; CHECK-SAME: ptr [[DATA:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: ret i32 5 -; diff --git a/llvmraytracing/test/dx/paq-hit-attribute-size.ll b/llvmraytracing/test/dx/paq-hit-attribute-size.ll deleted file mode 100644 index fe0de858d5..0000000000 --- a/llvmraytracing/test/dx/paq-hit-attribute-size.ll +++ /dev/null @@ -1,992 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; Test payload serialization layouts in presence of different max hit attribute -; size metadata. -; -; Default run checking serialization layouts and their usage: -; RUN: grep -v 'NOT-1' %s | opt -debug-only=lower-raytracing-pipeline --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-MAX-1 -; RUN: grep -v 'NOT-2' %s | opt -debug-only=lower-raytracing-pipeline --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-MAX-2 -; RUN: grep -v 'NOT-4' %s | opt -debug-only=lower-raytracing-pipeline --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-MAX-4 -; RUN: grep -v 'NOT-8' %s | opt -debug-only=lower-raytracing-pipeline --verify-each 
-passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-MAX-8 - -; Check that hit attributes violating the max size (here: 2 Dwords, set by removing lines containing NOT-2) are detected and crash. -; Note: The padding computation will fail before the actual hit attribute check in copyHitAttributes, because we are using more-than-expected storage -; for the hit attributes. So, we only check for an assertion to occur. -; RUN: grep -v 'NOT-INVALID' %s | not --crash opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix INVALID -; REQUIRES: assertions - -; INVALID: Assertion - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.MyPayload = type { float, i32, double } -%struct.Attributes1DWords = type { [1 x i32] } -%struct.Attributes2DWords = type { [2 x i32] } -%struct.Attributes4DWords = type { [4 x i32] } -%struct.Attributes8DWords = type { [8 x i32] } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.DispatchSystemData = type { i32 } -%struct.TraversalData = type { %struct.SystemData } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.HitData = type { float, i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.AnyHitSystemData = type { %struct.SystemData, %struct.PrimitiveSystemState } -%struct.PrimitiveSystemState = type { float, i32, i32, i32 } -%struct.RaytracingAccelerationStructure = type { i32 } 
-%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?gOutput@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -; If the app uses only 1 DWord for hit attributes, then the layout does not get smaller. -; Instead, one 1 DWord in system data is unused. -; CHECK-MAX-1-DAG: %struct.MyPayload.attr_max_1_i32s.layout_0_caller_out = type { [4 x i32] } -; CHECK-MAX-2-DAG: %struct.MyPayload.attr_max_2_i32s.layout_0_caller_out = type { [4 x i32] } -; CHECK-MAX-4-DAG: %struct.MyPayload.attr_max_4_i32s.layout_0_caller_out = type { [6 x i32] } -; CHECK-MAX-8-DAG: %struct.MyPayload.attr_max_8_i32s.layout_0_caller_out = type { [10 x i32] } - -define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWords* %attrs) !pointeetys !60 { -; CHECK-MAX-1-LABEL: define void @AnyHit1DWords( -; CHECK-MAX-1-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_ANYHITSYSTEMDATA:%.*]] [[TMP0:%.*]], [[STRUCT_ATTRIBUTES1DWORDS:%.*]] [[TMP1:%.*]], [1 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation.registercount [[META15:![0-9]+]] !continuation [[META19:![0-9]+]] { -; CHECK-MAX-1-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; CHECK-MAX-1-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; CHECK-MAX-1-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITSYSTEMDATA]], align 8 -; CHECK-MAX-1-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [4 x i32], align 4 -; CHECK-MAX-1-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; CHECK-MAX-1-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 -; CHECK-MAX-1-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_ATTRIBUTES1DWORDS]], align 8 -; CHECK-MAX-1-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
CHECK-MAX-1-NEXT: store [[STRUCT_ANYHITSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; CHECK-MAX-1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-1-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-1-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; CHECK-MAX-1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; CHECK-MAX-1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; CHECK-MAX-1-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 -; CHECK-MAX-1-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; CHECK-MAX-1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 -; CHECK-MAX-1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; CHECK-MAX-1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-MAX-1-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 -; CHECK-MAX-1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 3 -; CHECK-MAX-1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-1-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-MAX-1-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; CHECK-MAX-1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-1-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) -; CHECK-MAX-1-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 -; CHECK-MAX-1-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP4]], align 4 -; 
CHECK-MAX-1-NEXT: store i32 [[TMP20]], ptr [[ORIGHITATTRS]], align 4 -; CHECK-MAX-1-NEXT: store [[STRUCT_ATTRIBUTES1DWORDS]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-1-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-MAX-1-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-MAX-1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 -; CHECK-MAX-1-NEXT: store i32 [[TMP22]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; CHECK-MAX-1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 1 -; CHECK-MAX-1-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 -; CHECK-MAX-1-NEXT: store i32 [[TMP25]], ptr [[TMP23]], align 4 -; CHECK-MAX-1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; CHECK-MAX-1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 2 -; CHECK-MAX-1-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 -; CHECK-MAX-1-NEXT: store i32 [[TMP28]], ptr [[TMP26]], align 4 -; CHECK-MAX-1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-1-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 3 -; CHECK-MAX-1-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4 -; CHECK-MAX-1-NEXT: store i32 [[TMP31]], ptr [[TMP29]], align 4 -; CHECK-MAX-1-NEXT: [[TMP32:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-1-NEXT: store i32 [[TMP32]], ptr [[TMP3]], align 4 -; CHECK-MAX-1-NEXT: [[TMP33:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; CHECK-MAX-1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-1-NEXT: call void 
@_cont_SetTriangleHitAttributes(ptr [[TMP34]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP33]]) -; CHECK-MAX-1-NEXT: [[TMP35:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-1-NEXT: [[TMP36:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-1-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, i32 poison, i32 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP35]], [2 x i32] poison, [4 x i32] [[TMP36]]), !continuation.registercount [[META15]] -; CHECK-MAX-1-NEXT: unreachable -; -; CHECK-MAX-2-LABEL: define void @AnyHit1DWords( -; CHECK-MAX-2-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_ANYHITSYSTEMDATA:%.*]] [[TMP0:%.*]], [[STRUCT_ATTRIBUTES1DWORDS:%.*]] [[TMP1:%.*]], [1 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META20:![0-9]+]] !continuation.registercount [[META18:![0-9]+]] !continuation [[META21:![0-9]+]] { -; CHECK-MAX-2-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; CHECK-MAX-2-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; CHECK-MAX-2-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITSYSTEMDATA]], align 8 -; CHECK-MAX-2-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [4 x i32], align 4 -; CHECK-MAX-2-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; CHECK-MAX-2-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 -; CHECK-MAX-2-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_ATTRIBUTES1DWORDS]], align 8 -; CHECK-MAX-2-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: store [[STRUCT_ANYHITSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; CHECK-MAX-2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; 
CHECK-MAX-2-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; CHECK-MAX-2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; CHECK-MAX-2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 -; CHECK-MAX-2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; CHECK-MAX-2-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 -; CHECK-MAX-2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 3 -; CHECK-MAX-2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; CHECK-MAX-2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-2-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) -; CHECK-MAX-2-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 -; CHECK-MAX-2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP20]], ptr [[ORIGHITATTRS]], align 4 -; CHECK-MAX-2-NEXT: store [[STRUCT_ATTRIBUTES1DWORDS]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-2-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-MAX-2-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-MAX-2-NEXT: [[TMP21:%.*]] = 
getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-2-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP22]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP25]], ptr [[TMP23]], align 4 -; CHECK-MAX-2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; CHECK-MAX-2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 2 -; CHECK-MAX-2-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP28]], ptr [[TMP26]], align 4 -; CHECK-MAX-2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 3 -; CHECK-MAX-2-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP31]], ptr [[TMP29]], align 4 -; CHECK-MAX-2-NEXT: [[TMP32:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP32]], ptr [[TMP3]], align 4 -; CHECK-MAX-2-NEXT: [[TMP33:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; CHECK-MAX-2-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-2-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP34]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP33]]) -; CHECK-MAX-2-NEXT: [[TMP35:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: [[TMP36:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, i32 poison, i32 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP35]], [2 x i32] poison, [4 x i32] [[TMP36]]), !continuation.registercount [[META18]] -; CHECK-MAX-2-NEXT: unreachable -; -; CHECK-MAX-4-LABEL: define void @AnyHit1DWords( -; CHECK-MAX-4-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_ANYHITSYSTEMDATA:%.*]] [[TMP0:%.*]], [[STRUCT_ATTRIBUTES1DWORDS:%.*]] [[TMP1:%.*]], [3 x i32] [[PADDING:%.*]], [6 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META21:![0-9]+]] !continuation.registercount [[META19:![0-9]+]] !continuation [[META22:![0-9]+]] { -; CHECK-MAX-4-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; CHECK-MAX-4-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; CHECK-MAX-4-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITSYSTEMDATA]], align 8 -; CHECK-MAX-4-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [6 x i32], align 4 -; CHECK-MAX-4-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; CHECK-MAX-4-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 -; CHECK-MAX-4-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_ATTRIBUTES1DWORDS]], align 8 -; CHECK-MAX-4-NEXT: store [6 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: store [[STRUCT_ANYHITSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; CHECK-MAX-4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-4-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; CHECK-MAX-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; 
CHECK-MAX-4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; CHECK-MAX-4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 -; CHECK-MAX-4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; CHECK-MAX-4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-4-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) -; CHECK-MAX-4-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 -; CHECK-MAX-4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_4_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-MAX-4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP21]], ptr [[ORIGHITATTRS]], align 4 -; CHECK-MAX-4-NEXT: store [[STRUCT_ATTRIBUTES1DWORDS]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-4-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-MAX-4-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-MAX-4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-4-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 
-; CHECK-MAX-4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP26]], ptr [[TMP24]], align 4 -; CHECK-MAX-4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4 -; CHECK-MAX-4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 -; CHECK-MAX-4-NEXT: [[TMP33:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP33]], ptr [[TMP3]], align 4 -; CHECK-MAX-4-NEXT: [[TMP34:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; CHECK-MAX-4-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-4-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP35]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP34]]) -; CHECK-MAX-4-NEXT: [[TMP36:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: [[TMP37:%.*]] = load [6 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, i32 poison, i32 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP36]], [4 x i32] poison, [6 x i32] [[TMP37]]), !continuation.registercount [[META19]] -; CHECK-MAX-4-NEXT: unreachable -; -; CHECK-MAX-8-LABEL: define void @AnyHit1DWords( -; CHECK-MAX-8-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_ANYHITSYSTEMDATA:%.*]] [[TMP0:%.*]], [[STRUCT_ATTRIBUTES1DWORDS:%.*]] [[TMP1:%.*]], [7 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.registercount [[META20:![0-9]+]] !continuation [[META23:![0-9]+]] { -; CHECK-MAX-8-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; CHECK-MAX-8-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; CHECK-MAX-8-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITSYSTEMDATA]], align 8 -; CHECK-MAX-8-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 -; CHECK-MAX-8-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; CHECK-MAX-8-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 -; CHECK-MAX-8-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_ATTRIBUTES1DWORDS]], align 8 -; CHECK-MAX-8-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store [[STRUCT_ANYHITSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-8-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; CHECK-MAX-8-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; CHECK-MAX-8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; 
CHECK-MAX-8-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; CHECK-MAX-8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 -; CHECK-MAX-8-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; CHECK-MAX-8-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) -; CHECK-MAX-8-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 -; CHECK-MAX-8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-MAX-8-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP21]], ptr [[ORIGHITATTRS]], align 4 -; CHECK-MAX-8-NEXT: store [[STRUCT_ATTRIBUTES1DWORDS]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-8-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-MAX-8-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-MAX-8-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-8-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 
-; CHECK-MAX-8-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; CHECK-MAX-8-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP26]], ptr [[TMP24]], align 4 -; CHECK-MAX-8-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4 -; CHECK-MAX-8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 -; CHECK-MAX-8-NEXT: [[TMP33:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP33]], ptr [[TMP3]], align 4 -; CHECK-MAX-8-NEXT: [[TMP34:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; CHECK-MAX-8-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP35]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP34]]) -; CHECK-MAX-8-NEXT: [[TMP36:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP37:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, i32 poison, i32 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP36]], [8 x i32] poison, [10 x i32] [[TMP37]]), !continuation.registercount [[META20]] -; CHECK-MAX-8-NEXT: unreachable -; - ret void -} - -define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWords* %attrs) !pointeetys !23 { -; CHECK-MAX-1-LABEL: define void @AnyHit2DWords( -; CHECK-MAX-1-SAME: ptr [[PAYLOAD:%.*]], ptr [[ATTRS:%.*]]) { -; CHECK-MAX-1-NEXT: ret void -; -; CHECK-MAX-2-LABEL: define void @AnyHit2DWords( -; CHECK-MAX-2-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_ANYHITSYSTEMDATA:%.*]] [[TMP0:%.*]], [[STRUCT_ATTRIBUTES2DWORDS:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META20]] !continuation.registercount [[META18]] !continuation [[META22:![0-9]+]] { -; CHECK-MAX-2-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; CHECK-MAX-2-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; CHECK-MAX-2-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITSYSTEMDATA]], align 8 -; CHECK-MAX-2-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [4 x i32], align 4 -; CHECK-MAX-2-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; CHECK-MAX-2-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 -; CHECK-MAX-2-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_ATTRIBUTES2DWORDS]], align 8 -; CHECK-MAX-2-NEXT: store [4 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: store [[STRUCT_ANYHITSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; CHECK-MAX-2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-2-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
CHECK-MAX-2-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; CHECK-MAX-2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; CHECK-MAX-2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 -; CHECK-MAX-2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; CHECK-MAX-2-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 -; CHECK-MAX-2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 3 -; CHECK-MAX-2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; CHECK-MAX-2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-2-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) -; CHECK-MAX-2-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 -; CHECK-MAX-2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP20]], ptr [[ORIGHITATTRS]], align 4 -; CHECK-MAX-2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP23]], ptr [[TMP21]], align 4 -; CHECK-MAX-2-NEXT: store [[STRUCT_ATTRIBUTES2DWORDS]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-2-NEXT: 
call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-MAX-2-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-MAX-2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-2-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP25]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP28]], ptr [[TMP26]], align 4 -; CHECK-MAX-2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; CHECK-MAX-2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 2 -; CHECK-MAX-2-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP31]], ptr [[TMP29]], align 4 -; CHECK-MAX-2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 3 -; CHECK-MAX-2-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP34]], ptr [[TMP32]], align 4 -; CHECK-MAX-2-NEXT: [[TMP35:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP35]], ptr [[TMP3]], align 4 -; CHECK-MAX-2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; CHECK-MAX-2-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP36]], align 4 -; CHECK-MAX-2-NEXT: store i32 [[TMP38]], ptr [[TMP37]], align 4 -; CHECK-MAX-2-NEXT: [[TMP39:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; CHECK-MAX-2-NEXT: [[TMP40:%.*]] = getelementptr 
inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-2-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP40]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP39]]) -; CHECK-MAX-2-NEXT: [[TMP41:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: [[TMP42:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, i32 poison, i32 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP41]], [2 x i32] poison, [4 x i32] [[TMP42]]), !continuation.registercount [[META18]] -; CHECK-MAX-2-NEXT: unreachable -; -; CHECK-MAX-4-LABEL: define void @AnyHit2DWords( -; CHECK-MAX-4-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_ANYHITSYSTEMDATA:%.*]] [[TMP0:%.*]], [[STRUCT_ATTRIBUTES2DWORDS:%.*]] [[TMP1:%.*]], [2 x i32] [[PADDING:%.*]], [6 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META21]] !continuation.registercount [[META19]] !continuation [[META23:![0-9]+]] { -; CHECK-MAX-4-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; CHECK-MAX-4-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; CHECK-MAX-4-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITSYSTEMDATA]], align 8 -; CHECK-MAX-4-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [6 x i32], align 4 -; CHECK-MAX-4-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; CHECK-MAX-4-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 -; CHECK-MAX-4-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_ATTRIBUTES2DWORDS]], align 8 -; CHECK-MAX-4-NEXT: store [6 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: store [[STRUCT_ANYHITSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; 
CHECK-MAX-4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-4-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; CHECK-MAX-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; CHECK-MAX-4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 -; CHECK-MAX-4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; CHECK-MAX-4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-4-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) -; CHECK-MAX-4-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 -; CHECK-MAX-4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_4_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-MAX-4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP21]], ptr [[ORIGHITATTRS]], align 4 -; CHECK-MAX-4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; 
CHECK-MAX-4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4 -; CHECK-MAX-4-NEXT: store [[STRUCT_ATTRIBUTES2DWORDS]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-4-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-MAX-4-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-MAX-4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-4-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP26]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4 -; CHECK-MAX-4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 -; CHECK-MAX-4-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP35]], ptr [[TMP33]], align 4 -; CHECK-MAX-4-NEXT: [[TMP36:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP36]], ptr [[TMP3]], align 4 -; CHECK-MAX-4-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; 
CHECK-MAX-4-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP37]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP39]], ptr [[TMP38]], align 4 -; CHECK-MAX-4-NEXT: [[TMP40:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; CHECK-MAX-4-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-4-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP41]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP40]]) -; CHECK-MAX-4-NEXT: [[TMP42:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: [[TMP43:%.*]] = load [6 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, i32 poison, i32 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP42]], [4 x i32] poison, [6 x i32] [[TMP43]]), !continuation.registercount [[META19]] -; CHECK-MAX-4-NEXT: unreachable -; -; CHECK-MAX-8-LABEL: define void @AnyHit2DWords( -; CHECK-MAX-8-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_ANYHITSYSTEMDATA:%.*]] [[TMP0:%.*]], [[STRUCT_ATTRIBUTES2DWORDS:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META20]] !continuation [[META24:![0-9]+]] { -; CHECK-MAX-8-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; CHECK-MAX-8-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; CHECK-MAX-8-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITSYSTEMDATA]], align 8 -; CHECK-MAX-8-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 -; CHECK-MAX-8-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; CHECK-MAX-8-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 -; CHECK-MAX-8-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_ATTRIBUTES2DWORDS]], align 8 -; CHECK-MAX-8-NEXT: store [10 x i32] 
[[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store [[STRUCT_ANYHITSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-8-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; CHECK-MAX-8-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; CHECK-MAX-8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; CHECK-MAX-8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 -; CHECK-MAX-8-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; CHECK-MAX-8-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) -; CHECK-MAX-8-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 -; CHECK-MAX-8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr 
[[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-MAX-8-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP21]], ptr [[ORIGHITATTRS]], align 4 -; CHECK-MAX-8-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4 -; CHECK-MAX-8-NEXT: store [[STRUCT_ATTRIBUTES2DWORDS]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-8-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-MAX-8-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-MAX-8-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-8-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP26]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; CHECK-MAX-8-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4 -; CHECK-MAX-8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 -; CHECK-MAX-8-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP35]], ptr [[TMP33]], align 4 -; CHECK-MAX-8-NEXT: [[TMP36:%.*]] = load i32, 
ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP36]], ptr [[TMP3]], align 4 -; CHECK-MAX-8-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP37]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP39]], ptr [[TMP38]], align 4 -; CHECK-MAX-8-NEXT: [[TMP40:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; CHECK-MAX-8-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP41]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP40]]) -; CHECK-MAX-8-NEXT: [[TMP42:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP43:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, i32 poison, i32 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP42]], [8 x i32] poison, [10 x i32] [[TMP43]]), !continuation.registercount [[META20]] -; CHECK-MAX-8-NEXT: unreachable -; - ret void -} - -define void @AnyHit4DWords(%struct.MyPayload* %payload, %struct.Attributes4DWords* %attrs) !pointeetys !28 { -; CHECK-MAX-1-LABEL: define void @AnyHit4DWords( -; CHECK-MAX-1-SAME: ptr [[PAYLOAD:%.*]], ptr [[ATTRS:%.*]]) { -; CHECK-MAX-1-NEXT: ret void -; -; CHECK-MAX-2-LABEL: define void @AnyHit4DWords( -; CHECK-MAX-2-SAME: ptr [[PAYLOAD:%.*]], ptr [[ATTRS:%.*]]) { -; CHECK-MAX-2-NEXT: ret void -; -; CHECK-MAX-4-LABEL: define void @AnyHit4DWords( -; CHECK-MAX-4-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_ANYHITSYSTEMDATA:%.*]] [[TMP0:%.*]], [[STRUCT_ATTRIBUTES4DWORDS:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [6 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META21]] !continuation.registercount [[META19]] !continuation [[META24:![0-9]+]] { -; CHECK-MAX-4-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; CHECK-MAX-4-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; CHECK-MAX-4-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITSYSTEMDATA]], align 8 -; CHECK-MAX-4-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [6 x i32], align 4 -; CHECK-MAX-4-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; CHECK-MAX-4-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 -; CHECK-MAX-4-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_ATTRIBUTES4DWORDS]], align 8 -; CHECK-MAX-4-NEXT: store [6 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: store [[STRUCT_ANYHITSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; CHECK-MAX-4-NEXT: [[TMP7:%.*]] = getelementptr inbounds 
[[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-4-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; CHECK-MAX-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; CHECK-MAX-4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 -; CHECK-MAX-4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; CHECK-MAX-4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-4-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) -; CHECK-MAX-4-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 -; CHECK-MAX-4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_4_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-MAX-4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP21]], ptr [[ORIGHITATTRS]], align 4 -; CHECK-MAX-4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr 
[[TMP4]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4 -; CHECK-MAX-4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP20]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 -; CHECK-MAX-4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 3 -; CHECK-MAX-4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4 -; CHECK-MAX-4-NEXT: store [[STRUCT_ATTRIBUTES4DWORDS]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-4-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-MAX-4-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-MAX-4-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-4-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP31]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; CHECK-MAX-4-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP34]], ptr [[TMP32]], align 4 -; CHECK-MAX-4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP37]], ptr [[TMP35]], align 4 -; CHECK-MAX-4-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 2 
-; CHECK-MAX-4-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP39]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP40]], ptr [[TMP38]], align 4 -; CHECK-MAX-4-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_4_I32S_LAYOUT_2_ANYHIT_OUT_ACCEPT:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-MAX-4-NEXT: [[TMP42:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP42]], ptr [[TMP3]], align 4 -; CHECK-MAX-4-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP43]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP45]], ptr [[TMP44]], align 4 -; CHECK-MAX-4-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 2 -; CHECK-MAX-4-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP47]], ptr [[TMP41]], align 4 -; CHECK-MAX-4-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 3 -; CHECK-MAX-4-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 -; CHECK-MAX-4-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP48]], align 4 -; CHECK-MAX-4-NEXT: store i32 [[TMP50]], ptr [[TMP49]], align 4 -; CHECK-MAX-4-NEXT: [[TMP51:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; CHECK-MAX-4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-4-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP52]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP51]]) -; CHECK-MAX-4-NEXT: [[TMP53:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: [[TMP54:%.*]] = load [6 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, i32 poison, i32 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP53]], [4 x i32] poison, [6 x i32] [[TMP54]]), !continuation.registercount [[META19]] -; CHECK-MAX-4-NEXT: unreachable -; -; CHECK-MAX-8-LABEL: define void @AnyHit4DWords( -; CHECK-MAX-8-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_ANYHITSYSTEMDATA:%.*]] [[TMP0:%.*]], [[STRUCT_ATTRIBUTES4DWORDS:%.*]] [[TMP1:%.*]], [4 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META20]] !continuation [[META25:![0-9]+]] { -; CHECK-MAX-8-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; CHECK-MAX-8-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; CHECK-MAX-8-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITSYSTEMDATA]], align 8 -; CHECK-MAX-8-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 -; CHECK-MAX-8-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; CHECK-MAX-8-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 -; CHECK-MAX-8-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_ATTRIBUTES4DWORDS]], align 8 -; CHECK-MAX-8-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store [[STRUCT_ANYHITSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-8-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; CHECK-MAX-8-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; CHECK-MAX-8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; CHECK-MAX-8-NEXT: 
[[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; CHECK-MAX-8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 -; CHECK-MAX-8-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; CHECK-MAX-8-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) -; CHECK-MAX-8-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 -; CHECK-MAX-8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-MAX-8-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP21]], ptr [[ORIGHITATTRS]], align 4 -; CHECK-MAX-8-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4 -; CHECK-MAX-8-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP20]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 -; CHECK-MAX-8-NEXT: 
[[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 3 -; CHECK-MAX-8-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4 -; CHECK-MAX-8-NEXT: store [[STRUCT_ATTRIBUTES4DWORDS]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-8-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-MAX-8-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-MAX-8-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-8-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP31]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; CHECK-MAX-8-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP34]], ptr [[TMP32]], align 4 -; CHECK-MAX-8-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP37]], ptr [[TMP35]], align 4 -; CHECK-MAX-8-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP39]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP40]], ptr [[TMP38]], align 4 -; CHECK-MAX-8-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_2_ANYHIT_OUT_ACCEPT_PAYLOAD_ATTR_2_I32S:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-MAX-8-NEXT: [[TMP42:%.*]] = load i32, ptr 
[[HITATTRSALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP42]], ptr [[TMP3]], align 4 -; CHECK-MAX-8-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP43]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP45]], ptr [[TMP44]], align 4 -; CHECK-MAX-8-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP47]], ptr [[TMP41]], align 4 -; CHECK-MAX-8-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 3 -; CHECK-MAX-8-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP48]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP50]], ptr [[TMP49]], align 4 -; CHECK-MAX-8-NEXT: [[TMP51:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; CHECK-MAX-8-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP52]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP51]]) -; CHECK-MAX-8-NEXT: [[TMP53:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP54:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, i32 poison, i32 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP53]], [8 x i32] poison, [10 x i32] [[TMP54]]), !continuation.registercount [[META20]] -; CHECK-MAX-8-NEXT: unreachable -; - ret void -} - -define void @AnyHit8DWords(%struct.MyPayload* %payload, %struct.Attributes8DWords* %attrs) !pointeetys !63 { -; CHECK-MAX-1-LABEL: define void @AnyHit8DWords( -; CHECK-MAX-1-SAME: ptr [[PAYLOAD:%.*]], ptr [[ATTRS:%.*]]) { -; CHECK-MAX-1-NEXT: ret void -; -; CHECK-MAX-2-LABEL: define void @AnyHit8DWords( -; CHECK-MAX-2-SAME: ptr [[PAYLOAD:%.*]], ptr [[ATTRS:%.*]]) { -; CHECK-MAX-2-NEXT: ret void -; -; CHECK-MAX-4-LABEL: define void @AnyHit8DWords( -; CHECK-MAX-4-SAME: ptr [[PAYLOAD:%.*]], ptr [[ATTRS:%.*]]) { -; CHECK-MAX-4-NEXT: ret void -; -; CHECK-MAX-8-LABEL: define void @AnyHit8DWords( -; CHECK-MAX-8-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_ANYHITSYSTEMDATA:%.*]] [[TMP0:%.*]], [[STRUCT_ATTRIBUTES8DWORDS:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META20]] !continuation [[META26:![0-9]+]] { -; CHECK-MAX-8-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; CHECK-MAX-8-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; CHECK-MAX-8-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITSYSTEMDATA]], align 8 -; CHECK-MAX-8-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 -; CHECK-MAX-8-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; CHECK-MAX-8-NEXT: [[ORIGHITATTRS:%.*]] = alloca [8 x i32], align 4 -; CHECK-MAX-8-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_ATTRIBUTES8DWORDS]], align 8 -; CHECK-MAX-8-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store [[STRUCT_ANYHITSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP6:%.*]] = getelementptr 
inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-8-NEXT: [[TMP8:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 -; CHECK-MAX-8-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; CHECK-MAX-8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; CHECK-MAX-8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 -; CHECK-MAX-8-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; CHECK-MAX-8-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) -; CHECK-MAX-8-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 -; CHECK-MAX-8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-MAX-8-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP21]], ptr [[ORIGHITATTRS]], align 4 -; 
CHECK-MAX-8-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4 -; CHECK-MAX-8-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP20]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 -; CHECK-MAX-8-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 3 -; CHECK-MAX-8-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4 -; CHECK-MAX-8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 4 -; CHECK-MAX-8-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 -; CHECK-MAX-8-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 5 -; CHECK-MAX-8-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 3 -; CHECK-MAX-8-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP35]], ptr [[TMP33]], align 4 -; CHECK-MAX-8-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 6 -; CHECK-MAX-8-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 4 -; CHECK-MAX-8-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP38]], ptr [[TMP36]], align 4 -; CHECK-MAX-8-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 7 -; CHECK-MAX-8-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 5 -; CHECK-MAX-8-NEXT: 
[[TMP41:%.*]] = load i32, ptr [[TMP40]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP41]], ptr [[TMP39]], align 4 -; CHECK-MAX-8-NEXT: store [[STRUCT_ATTRIBUTES8DWORDS]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-8-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; CHECK-MAX-8-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-MAX-8-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP5]], i32 0 -; CHECK-MAX-8-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP43]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; CHECK-MAX-8-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP45]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP46]], ptr [[TMP44]], align 4 -; CHECK-MAX-8-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP49]], ptr [[TMP47]], align 4 -; CHECK-MAX-8-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP52]], ptr [[TMP50]], align 4 -; CHECK-MAX-8-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_2_ANYHIT_OUT_ACCEPT:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-MAX-8-NEXT: [[TMP54:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP54]], ptr [[TMP3]], align 4 -; CHECK-MAX-8-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; 
CHECK-MAX-8-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP57:%.*]] = load i32, ptr [[TMP55]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP57]], ptr [[TMP56]], align 4 -; CHECK-MAX-8-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP59:%.*]] = load i32, ptr [[TMP58]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP59]], ptr [[TMP53]], align 4 -; CHECK-MAX-8-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 3 -; CHECK-MAX-8-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[TMP53]], i32 1 -; CHECK-MAX-8-NEXT: [[TMP62:%.*]] = load i32, ptr [[TMP60]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP62]], ptr [[TMP61]], align 4 -; CHECK-MAX-8-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 4 -; CHECK-MAX-8-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[TMP53]], i32 2 -; CHECK-MAX-8-NEXT: [[TMP65:%.*]] = load i32, ptr [[TMP63]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP65]], ptr [[TMP64]], align 4 -; CHECK-MAX-8-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 5 -; CHECK-MAX-8-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[TMP53]], i32 3 -; CHECK-MAX-8-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP66]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP68]], ptr [[TMP67]], align 4 -; CHECK-MAX-8-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 6 -; CHECK-MAX-8-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[TMP53]], i32 4 -; CHECK-MAX-8-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP69]], align 4 -; CHECK-MAX-8-NEXT: store i32 [[TMP71]], ptr [[TMP70]], align 4 -; CHECK-MAX-8-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 7 -; CHECK-MAX-8-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[TMP53]], i32 5 -; CHECK-MAX-8-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP72]], align 4 -; CHECK-MAX-8-NEXT: store i32 
[[TMP74]], ptr [[TMP73]], align 4 -; CHECK-MAX-8-NEXT: [[TMP75:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; CHECK-MAX-8-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP76]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP75]]) -; CHECK-MAX-8-NEXT: [[TMP77:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: [[TMP78:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, i32 poison, i32 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP77]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META20]] -; CHECK-MAX-8-NEXT: unreachable -; - ret void -} - -; Function Attrs: nounwind -declare !pointeetys !30 void @dx.op.traceRay.struct.MyPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.MyPayload*) #0 - -; Function Attrs: nounwind -declare void @dx.op.textureStore.f32(i32, %dx.types.Handle, i32, i32, i32, float, float, float, float, i8) #0 - -; Function Attrs: nounwind memory(none) -declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #1 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 - -; Function Attrs: nounwind memory(read) -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2 - -; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i32, %struct.TraversalData) #3 - -; Function Attrs: alwaysinline -declare !pointeetys !31 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #3 - -; Function Attrs: alwaysinline -declare !pointeetys !33 void 
@_cont_SetTriangleHitAttributes(%struct.SystemData*, %struct.BuiltInTriangleIntersectionAttributes) #3 - -; Function Attrs: alwaysinline -declare !pointeetys !34 i1 @_cont_IsEndSearch(%struct.TraversalData*) #3 - -; Function Attrs: nounwind memory(read) -declare !pointeetys !36 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, %struct.HitData*) #2 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !38 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !40 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1 - -declare !pointeetys !42 i1 @_cont_ReportHit(%struct.AnyHitSystemData*, float, i32) - -declare !pointeetys !44 void @_cont_AcceptHit(%struct.AnyHitSystemData*) - -declare !pointeetys !45 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -; Function Attrs: alwaysinline -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #3 !pointeetys !45 { -; CHECK-LABEL: define i32 @_cont_GetLocalRootIndex( -; CHECK-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: ret i32 5 -; - ret i32 5 -} - -; Function Attrs: alwaysinline -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #3 !pointeetys !46 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i32 4, %struct.TraversalData %trav_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -; Function Attrs: nocallback nofree nosync 
nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !47 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !47 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(none) } -attributes #2 = { nounwind memory(read) } -attributes #3 = { alwaysinline } -attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -; DX entry points. We use grep filters on NOT-{maxSize} to only enable compatible shaders. -!dx.entryPoints = !{ - !10 - , !14 ; AHS using 1 Dword attributes. - , !15 ; AHS using 2 Dword attributes. NOT-1 - , !16 ; AHS using 4 Dword attributes. NOT-1 NOT-2 - , !17 ; AHS using 8 Dword attributes. NOT-1 NOT-2 NOT-4 -} - -; We filter out one of the following lines using a grep in the RUN line. -; The NOT-{maxSize} patterns are used to run a test with the max hit attribute size to -; maxSize, and only enabling compatible shaders. -; The NOT-INVALID pattern is used to run all shaders with a max attribute size of 2 dwords, -; which is expected to fail. -!lgc.rt.max.attribute.size = !{!49} ; 1 DWord(s). NOT-2 NOT-4 NOT-8 NOT-INVALID -!lgc.rt.max.attribute.size = !{!22} ; 2 DWord(s). NOT-1 NOT-4 NOT-8 -!lgc.rt.max.attribute.size = !{!26} ; 4 DWord(s). NOT-1 NOT-2 NOT-8 NOT-INVALID -!lgc.rt.max.attribute.size = !{!27} ; 8 DWord(s). 
NOT-1 NOT-2 NOT-4 NOT-INVALID - -!0 = !{!"dxcoob 2019.05.00"} -!1 = !{i32 1, i32 7} -!2 = !{!"lib", i32 6, i32 7} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"myAccelerationStructure", i32 0, i32 3, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?gOutput@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"gOutput", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{null, !"", null, !3, !11} -!11 = !{i32 0, i64 65540} -!12 = !{i32 8, i32 9, i32 5, !13} -!13 = !{i32 0} -!14 = !{void (%struct.MyPayload*, %struct.Attributes1DWords*)* @AnyHit1DWords, !"AnyHit1DWords", null, null, !12} -!15 = !{void (%struct.MyPayload*, %struct.Attributes4DWords*)* @AnyHit2DWords, !"AnyHit2DWords", null, null, !12} -!16 = !{void (%struct.MyPayload*, %struct.Attributes4DWords*)* @AnyHit4DWords, !"AnyHit4DWords", null, null, !12} -!17 = !{void (%struct.MyPayload*, %struct.Attributes4DWords*)* @AnyHit8DWords, !"AnyHit8DWords", null, null, !12} -!22 = !{i32 8} -!23 = !{null, %struct.MyPayload poison, %struct.Attributes2DWords poison} -!24 = !{i32 0, %struct.MyPayload poison} -!25 = !{i32 0, %struct.Attributes2DWords poison} -!26 = !{i32 16} -!27 = !{i32 32} -!28 = !{null, %struct.MyPayload poison, %struct.Attributes4DWords poison} -!29 = !{i32 0, %struct.Attributes4DWords poison} -!30 = !{%struct.MyPayload poison} -!31 = !{%struct.SystemData poison} -!32 = !{i32 0, %struct.SystemData poison} -!33 = !{%struct.SystemData poison} -!34 = !{%struct.TraversalData poison} -!35 = !{i32 0, %struct.TraversalData poison} -!36 = !{null, %struct.SystemData poison, %struct.HitData poison} -!37 = !{i32 0, %struct.HitData poison} -!38 = !{%struct.DispatchSystemData poison} 
-!39 = !{i32 0, %struct.DispatchSystemData poison} -!40 = !{%struct.AnyHitTraversalData poison} -!41 = !{i32 0, %struct.AnyHitTraversalData poison} -!42 = !{%struct.AnyHitSystemData poison} -!43 = !{i32 0, %struct.AnyHitSystemData poison} -!44 = !{%struct.AnyHitSystemData poison} -!45 = !{%struct.DispatchSystemData poison} -!46 = !{%struct.DispatchSystemData poison} -!47 = !{i8 poison} -!48 = !{i32 0, i8 poison} -!49 = !{i32 4} -!60 = !{null, %struct.MyPayload poison, %struct.Attributes1DWords poison} -!61 = !{i32 0, %struct.MyPayload poison} -!62 = !{i32 0, %struct.Attributes1DWords poison} -!63 = !{null, %struct.MyPayload poison, %struct.Attributes8DWords poison} -!64 = !{i32 0, %struct.Attributes8DWords poison} -;. -; CHECK-MAX-1: [[META15]] = !{i32 4} -; CHECK-MAX-1: [[META18]] = !{i32 2} -; CHECK-MAX-1: [[META19]] = !{ptr @AnyHit1DWords} -;. -; CHECK-MAX-2: [[META18]] = !{i32 4} -; CHECK-MAX-2: [[META20]] = !{i32 2} -; CHECK-MAX-2: [[META21]] = !{ptr @AnyHit1DWords} -; CHECK-MAX-2: [[META22]] = !{ptr @AnyHit2DWords} -;. -; CHECK-MAX-4: [[META19]] = !{i32 6} -; CHECK-MAX-4: [[META21]] = !{i32 2} -; CHECK-MAX-4: [[META22]] = !{ptr @AnyHit1DWords} -; CHECK-MAX-4: [[META23]] = !{ptr @AnyHit2DWords} -; CHECK-MAX-4: [[META24]] = !{ptr @AnyHit4DWords} -;. -; CHECK-MAX-8: [[META20]] = !{i32 10} -; CHECK-MAX-8: [[META22]] = !{i32 2} -; CHECK-MAX-8: [[META23]] = !{ptr @AnyHit1DWords} -; CHECK-MAX-8: [[META24]] = !{ptr @AnyHit2DWords} -; CHECK-MAX-8: [[META25]] = !{ptr @AnyHit4DWords} -; CHECK-MAX-8: [[META26]] = !{ptr @AnyHit8DWords} -;. 
diff --git a/llvmraytracing/test/dx/payload-caller-in-paq.ll b/llvmraytracing/test/dx/payload-caller-in-paq.ll deleted file mode 100644 index 794a3b69f3..0000000000 --- a/llvmraytracing/test/dx/payload-caller-in-paq.ll +++ /dev/null @@ -1,263 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function RayGen --version 3 -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,inline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s - -; Test the special case of payload import in the caller after TraceRay. Here, we cast the -; payload storage both to the ClosestHitOut layout and the MissOut layout and import both, -; skipping duplicate copies. -; This file was generated by compiling payload_caller_in_paq.ll.hlsl -; and manually adding some gpurt content at the end. -; Also, '@"\01?RayGen@@YAXXZ"' was renamed to '@RayGen' to match update_test_checks.py's -; function name regex. -; Note that the payload has nontrivial payload access qualifiers set. 
- -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.MyPayload = type { float, i32, double } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.DispatchSystemData = type { i32 } -%struct.TraversalData = type { %struct.SystemData, i32 } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.HitData = type { float, i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?gOutput@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { - ret void -} - -; Function Attrs: nounwind -define void @RayGen() #0 { -; LOWERRAYTRACINGPIPELINE-LABEL: define void @RayGen( -; LOWERRAYTRACINGPIPELINE-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] !continuation [[META27:![0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META23]] { -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [7 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?gOutput@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_MYPAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = bitcast ptr [[TMP4]] to ptr -; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR0]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store float 1.000000e+00, ptr [[TMP6]], align 8, !tbaa [[TBAA28:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = call i32 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i32 
[[ADDR_I]], 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [11 x i32], [3 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa11i32a3i32s(i32 4, i32 8, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[TMP12]]), !waitmask [[META13]], !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount [[META25:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [11 x i32], [3 x i32] } [[TMP17]], 2 -; LOWERRAYTRACINGPIPELINE-NEXT: store [3 x i32] [[TMP15]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = freeze [[STRUCT_MYPAYLOAD]] poison -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_MYPAYLOAD]] [[TMP27]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP38]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP18:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP22]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [11 x i32], [3 x i32] } [[TMP17]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP13]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] -; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP6]], align 8, !tbaa [[TBAA28]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA33:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = sitofp i32 [[TMP44]] to float -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load double, ptr [[TMP46]], align 8, !tbaa [[TBAA35:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = fptrunc double [[TMP47]] to float -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-NEXT: 
[[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP49]], i8 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP50]], i8 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP37]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) -; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP52]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP42]], float [[TMP45]], float [[TMP48]], float 0.000000e+00, i8 15) -; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR0]] -; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() -; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?gOutput@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.MyPayload, align 8 - %4 = bitcast %struct.MyPayload* %3 to i8* - call void @llvm.lifetime.start.p0i8(i64 16, i8* %4) #0 - %5 = getelementptr inbounds %struct.MyPayload, %struct.MyPayload* %3, i32 0, i32 0 - store float 1.000000e+00, float* %5, align 8, !tbaa !24 - %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) - %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }) - call void @dx.op.traceRay.struct.MyPayload(i32 157, %dx.types.Handle %7, i32 0, i32 0, i32 0, i32 0, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, 
float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, %struct.MyPayload* nonnull %3) - %8 = load float, float* %5, align 8, !tbaa !24 - %9 = getelementptr inbounds %struct.MyPayload, %struct.MyPayload* %3, i32 0, i32 1 - %10 = load i32, i32* %9, align 4, !tbaa !28 - %11 = sitofp i32 %10 to float - %12 = getelementptr inbounds %struct.MyPayload, %struct.MyPayload* %3, i32 0, i32 2 - %13 = load double, double* %12, align 8, !tbaa !30 - %14 = fptrunc double %13 to float - %15 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) - %16 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) - %17 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2) - %18 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %17, %dx.types.ResourceProperties { i32 4098, i32 1033 }) - call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %18, i32 %15, i32 %16, i32 undef, float %8, float %11, float %14, float 0.000000e+00, i8 15) - call void @llvm.lifetime.end.p0i8(i64 16, i8* %4) #0 - ret void -} - -; Function Attrs: nounwind -declare !pointeetys !32 void @dx.op.traceRay.struct.MyPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.MyPayload*) #0 - -; Function Attrs: nounwind -declare void @dx.op.textureStore.f32(i32, %dx.types.Handle, i32, i32, i32, float, float, float, float, i8) #0 - -; Function Attrs: nounwind memory(none) -declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #1 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 - -; Function Attrs: nounwind memory(read) -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2 - -; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdWaitAwaitTraversal(i32, i64, %struct.TraversalData) #3 - -; Function Attrs: alwaysinline -declare !pointeetys !34 
%struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #3 - -; Function Attrs: nounwind memory(read) -declare !pointeetys !36 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData* nocapture readnone) #2 - -; Function Attrs: alwaysinline -declare !pointeetys !38 void @_cont_SetTriangleHitAttributes(%struct.SystemData*, %struct.BuiltInTriangleIntersectionAttributes) #3 - -; Function Attrs: alwaysinline -declare !pointeetys !39 i1 @_cont_IsEndSearch(%struct.TraversalData*) #3 - -; Function Attrs: nounwind memory(read) -declare !pointeetys !41 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, %struct.HitData*) #2 - -declare !pointeetys !50 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) - -; Function Attrs: nounwind memory(none) -declare !pointeetys !43 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !44 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1 - -; Function Attrs: nounwind -declare i32 @_AmdGetResumePointAddr() #3 - -; Function Attrs: alwaysinline -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #3 !pointeetys !46 { - ret i32 5 -} - -; Function Attrs: alwaysinline -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #3 !pointeetys !47 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %addr = call i32 @_AmdGetResumePointAddr() #3 - %trav_data2 = insertvalue %struct.TraversalData %trav_data, i32 %addr, 1 - %newdata = call %struct.DispatchSystemData @_AmdWaitAwaitTraversal(i32 4, i64 -1, 
%struct.TraversalData %trav_data2) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !48 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !48 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(none) } -attributes #2 = { nounwind memory(read) } -attributes #3 = { alwaysinline } -attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{!10} -!dx.dxrPayloadAnnotations = !{!14} -!dx.entryPoints = !{!19, !21} - -!0 = !{!"dxcoob 2019.05.00"} -!1 = !{i32 1, i32 7} -!2 = !{!"lib", i32 6, i32 7} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"myAccelerationStructure", i32 0, i32 3, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?gOutput@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"gOutput", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{i32 1, void ()* @RayGen, !11} -!11 = !{!12} -!12 = !{i32 1, !13, !13} -!13 = !{} -!14 = !{i32 0, %struct.MyPayload undef, !15} -!15 = !{!16, !17, !18} -!16 = !{i32 0, i32 3} -!17 = !{i32 0, i32 33} -!18 = !{i32 0, i32 513} -!19 = !{null, !"", null, !3, !20} -!20 = !{i32 0, i64 65540} -!21 = !{void ()* 
@RayGen, !"RayGen", null, null, !22} -!22 = !{i32 8, i32 7, i32 5, !23} -!23 = !{i32 0} -!24 = !{!25, !25, i64 0} -!25 = !{!"float", !26, i64 0} -!26 = !{!"omnipotent char", !27, i64 0} -!27 = !{!"Simple C/C++ TBAA"} -!28 = !{!29, !29, i64 0} -!29 = !{!"int", !26, i64 0} -!30 = !{!31, !31, i64 0} -!31 = !{!"double", !26, i64 0} -!32 = !{%struct.MyPayload poison} -!33 = !{i32 0, %struct.MyPayload poison} -!34 = !{%struct.SystemData poison} -!35 = !{i32 0, %struct.SystemData poison} -!36 = !{%struct.DispatchSystemData poison} -!37 = !{i32 0, %struct.DispatchSystemData poison} -!38 = !{%struct.SystemData poison} -!39 = !{%struct.TraversalData poison} -!40 = !{i32 0, %struct.TraversalData poison} -!41 = !{null, %struct.SystemData poison, %struct.HitData poison} -!42 = !{i32 0, %struct.HitData poison} -!43 = !{%struct.DispatchSystemData poison} -!44 = !{%struct.AnyHitTraversalData poison} -!45 = !{i32 0, %struct.AnyHitTraversalData poison} -!46 = !{%struct.DispatchSystemData poison} -!47 = !{%struct.DispatchSystemData poison} -!48 = !{i8 poison} -!49 = !{i32 0, i8 poison} -!50 = !{%struct.AnyHitTraversalData poison} diff --git a/llvmraytracing/test/dx/payload-caller-in-paq.ll.hlsl b/llvmraytracing/test/dx/payload-caller-in-paq.ll.hlsl deleted file mode 100644 index 55c61edaf0..0000000000 --- a/llvmraytracing/test/dx/payload-caller-in-paq.ll.hlsl +++ /dev/null @@ -1,22 +0,0 @@ -// This file is not a test, rather it was used to generate -// payload_caller_in_paq.ll and is kept so the .ll file can be re-generated. 
- -struct[raypayload] MyPayload { - float v1 : write(caller) : read(caller); - int v2 : write(closesthit) : read(caller); - double v3 : write(miss) : read(caller); -}; - -RaytracingAccelerationStructure myAccelerationStructure : register(t3); -RWTexture2D gOutput : register(u0); - -[shader("raygeneration")] void RayGen() { - MyPayload payload; - payload.v1 = 1.0; - - RayDesc myRay = {float3(0., 0., 0.), 0., float3(0., 0., 0.), 1.0}; - - TraceRay(myAccelerationStructure, 0, 0, 0, 0, 0, myRay, payload); - - gOutput[DispatchRaysIndex().xy] = float4(payload.v1, payload.v2, payload.v3, 0.); -} diff --git a/llvmraytracing/test/dx/payload-save-registers.ll b/llvmraytracing/test/dx/payload-save-registers.ll deleted file mode 100644 index c937e8cf95..0000000000 --- a/llvmraytracing/test/dx/payload-save-registers.ll +++ /dev/null @@ -1,1285 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s - -; Test that we correctly save and restore registers before/after recursive -; TraceRay or CallShader if there are live values in payload registers that -; are not overwritten at the end of the shader, e.g. the payload memory pointer. -; This applies to ClosestHit and Miss. -; This file was generated by compiling payload_save_registers.ll.hlsl -; and manually adding some gpurt content at the end. -; Also, function name mangling was removed. -; Note that the payload has payload access qualifiers set. 
- -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.OuterPayload = type { [15 x float], [15 x float] } -%struct.InnerPayload = type { float } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.DispatchSystemData = type { i32 } -%struct.TraversalData = type { %struct.SystemData } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.HitData = type { float, i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.RaytracingAccelerationStructure = type { i32 } - -declare !pointeetys !48 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) - -@"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 - -; Function Attrs: nounwind -define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !pointeetys !23 { -; LOWERRAYTRACINGPIPELINE-LABEL: define void @Miss( -; LOWERRAYTRACINGPIPELINE-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [4 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META26:![0-9]+]] !continuation.registercount [[META24:![0-9]+]] !continuation [[META27:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_OUTERPAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] 
[[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP36]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP64]], ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP66]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP29]], ptr [[TMP22]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP68]], ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 10 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP70]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP38]], ptr [[TMP28]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP72]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP30]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP44]], ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP74]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP76]], ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 15 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP78]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 16 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP81]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 17 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP83]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP90:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 18 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP90]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 19 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP56]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP107:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 20 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP107]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 21 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP60]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP82]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP85:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP85]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP55]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP92:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP92]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP141:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP141]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP40]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP63]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP65]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = alloca [[STRUCT_INNERPAYLOAD:%.*]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = bitcast ptr [[TMP46]] to ptr -; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP47]]) #[[ATTR0]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA28:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP116]], ptr [[TMP50]], align 4, !tbaa [[TBAA28]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP45]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call 
[[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP84]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP52]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP54]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = load i32, ptr [[TMP57]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP119]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP125:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa4i32a1i32s(i32 4, i32 8, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [4 x i32] poison, [1 x i32] [[TMP79]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount [[META32]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } [[TMP125]], 2 -; LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP61]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = freeze [[STRUCT_INNERPAYLOAD]] poison -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_INNERPAYLOAD]] [[TMP117]], ptr [[TMP46]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP122:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP122]], ptr [[TMP59]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } [[TMP125]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP58]], ptr [[TMP54]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] -; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = load float, ptr [[TMP50]], align 4, !tbaa [[TBAA28]] -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP97]], ptr [[TMP48]], align 4, !tbaa [[TBAA28]] -; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP47]]) #[[ATTR0]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 15 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP80]], ptr [[TMP98]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 16 -; 
LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], ptr [[TMP99]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 17 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP86]], ptr [[TMP131]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 18 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP96]], ptr [[TMP101]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 19 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP105]], ptr [[TMP102]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP134:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 20 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP108]], ptr [[TMP134]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 21 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP110]], ptr [[TMP104]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP87:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP111]], ptr [[TMP87]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP113]], ptr [[TMP88]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[TMP89]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP71]], ptr [[TMP91]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP73]], ptr [[TMP42]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP93:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP75]], ptr [[TMP93]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP77]], ptr [[TMP49]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = load i32, ptr [[TMP62]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP114]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP137:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP128:%.*]] = load i32, ptr [[TMP100]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP128]], ptr [[TMP137]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP103]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP120]], ptr [[TMP140]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP146:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP106:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP106]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 
[[TMP123]], ptr [[TMP146]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP149:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP109:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP109]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP126]], ptr [[TMP149]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP152:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = load i32, ptr [[TMP112]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP129]], ptr [[TMP152]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP155:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = load i32, ptr [[TMP115]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP132]], ptr [[TMP155]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP157:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP118:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = load i32, ptr [[TMP118]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP135]], ptr [[TMP157]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP158:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP121:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = load i32, ptr [[TMP121]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP138]], ptr [[TMP158]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP159:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP124:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP124]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP160]], ptr [[TMP159]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP142:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP127:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = load i32, ptr [[TMP127]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP144]], ptr [[TMP142]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP145:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP147:%.*]] = load i32, ptr [[TMP130]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP147]], ptr [[TMP145]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP148:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP133:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP150:%.*]] = load i32, ptr [[TMP133]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP150]], ptr [[TMP148]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP151:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP136:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP153:%.*]] = load i32, ptr [[TMP136]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP153]], ptr [[TMP151]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP154:%.*]] = getelementptr inbounds i32, ptr 
[[PAYLOAD_SERIALIZATION_ALLOCA]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP139:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP156:%.*]] = load i32, ptr [[TMP139]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP156]], ptr [[TMP154]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP94]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP143:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]], [4 x i32] poison, [30 x i32] [[TMP143]]), !continuation.registercount [[META24]] -; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 - %2 = alloca %struct.InnerPayload, align 4 - %3 = bitcast %struct.InnerPayload* %2 to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #0 - %4 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 14 - %5 = load float, float* %4, align 4, !tbaa !25 - %6 = getelementptr inbounds %struct.InnerPayload, %struct.InnerPayload* %2, i32 0, i32 0 - store float %5, float* %6, align 4, !tbaa !25 - %7 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) - %8 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %7, %dx.types.ResourceProperties { i32 16, i32 0 }) - call void @dx.op.traceRay.struct.InnerPayload(i32 157, %dx.types.Handle %8, i32 0, i32 0, i32 0, i32 0, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, 
float 1.000000e+00, %struct.InnerPayload* nonnull %2) - %9 = load float, float* %6, align 4, !tbaa !25 - store float %9, float* %4, align 4, !tbaa !25 - call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #0 - ret void -} - -; Function Attrs: nounwind -define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeetys !23 { -; LOWERRAYTRACINGPIPELINE-LABEL: define void @Callable( -; LOWERRAYTRACINGPIPELINE-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [4 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0]] !lgc.rt.shaderstage [[META33:![0-9]+]] !continuation.registercount [[META24]] !continuation [[META34:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_OUTERPAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_OUTERPAYLOAD]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = 
getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP45]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP51]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP29]], ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = load i32, ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP57]], ptr [[TMP22]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP63]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP66]], ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP41:%.*]] = load i32, ptr [[TMP69]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP28]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP72]], ptr [[TMP30]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP75]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP47]], ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 15 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 15 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP49]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP78]], ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 16 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP81]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP53]], ptr [[TMP36]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 17 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 17 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = load i32, ptr [[TMP55]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP84]], 
ptr [[TMP38]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 18 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 18 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = load i32, ptr [[TMP87]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP59]], ptr [[TMP40]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 19 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 19 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP61]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP90]], ptr [[TMP42]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 20 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 20 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = load i32, ptr [[TMP64]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP65]], ptr [[TMP44]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 21 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 21 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP67]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP68]], ptr [[TMP46]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP71]], ptr [[TMP48]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr 
[[TMP4]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP73]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP74]], ptr [[TMP50]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP76]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP77]], ptr [[TMP52]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP79]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP80]], ptr [[TMP54]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP82]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP83]], ptr [[TMP56]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP86]], ptr [[TMP58]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr 
[[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP88]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP89]], ptr [[TMP60]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP91]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP92]], ptr [[TMP62]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = load float, ptr [[TMP101]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP102]], ptr [[TMP100]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = load float, ptr [[TMP104]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP98]], ptr [[TMP103]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = load float, ptr [[TMP107]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP108]], ptr [[TMP106]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP109:%.*]] = 
getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = load float, ptr [[TMP110]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP105]], ptr [[TMP109]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = load float, ptr [[TMP113]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP114]], ptr [[TMP112]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP115:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = load float, ptr [[TMP116]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP111]], ptr [[TMP115]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP118:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = load float, ptr [[TMP119]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP120]], ptr [[TMP118]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP121:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP122:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = load float, 
ptr [[TMP122]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP117]], ptr [[TMP121]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP124:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP126:%.*]] = load float, ptr [[TMP125]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP126]], ptr [[TMP124]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP127:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP128:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP123:%.*]] = load float, ptr [[TMP128]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP123]], ptr [[TMP127]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = load float, ptr [[TMP131]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP132]], ptr [[TMP130]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = load float, ptr [[TMP134]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP129]], ptr [[TMP133]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP137:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = load float, ptr [[TMP137]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP138]], ptr [[TMP136]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = load float, ptr [[TMP140]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP135]], ptr [[TMP139]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP142:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP143:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0, i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = load float, ptr [[TMP143]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP144]], ptr [[TMP142]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP145:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP146:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP141:%.*]] = load float, ptr [[TMP146]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP141]], ptr [[TMP145]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP148:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP149:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP147:%.*]] = load float, ptr [[TMP149]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP147]], ptr [[TMP148]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP151:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP152:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP150:%.*]] = load float, ptr [[TMP152]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP150]], ptr [[TMP151]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP154:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP155:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP153:%.*]] = load float, ptr [[TMP155]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP153]], ptr [[TMP154]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP157:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP158:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP156:%.*]] = load float, ptr [[TMP158]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP156]], ptr [[TMP157]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP160:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP161:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP159:%.*]] = load float, ptr [[TMP161]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP159]], ptr [[TMP160]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP163:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP164:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 6 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP162:%.*]] = load float, ptr [[TMP164]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP162]], ptr [[TMP163]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP166:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP167:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP165:%.*]] = load float, ptr [[TMP167]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP165]], ptr [[TMP166]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP169:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP170:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP168:%.*]] = load float, ptr [[TMP170]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP168]], ptr [[TMP169]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP172:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP173:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP171:%.*]] = load float, ptr [[TMP173]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP171]], ptr [[TMP172]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP175:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP176:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP174:%.*]] = load float, ptr [[TMP176]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP174]], ptr [[TMP175]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP178:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr 
[[TMP2]], i32 0, i32 1, i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP179:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP177:%.*]] = load float, ptr [[TMP179]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP177]], ptr [[TMP178]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP181:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP182:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP180:%.*]] = load float, ptr [[TMP182]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP180]], ptr [[TMP181]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP184:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP185:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP276:%.*]] = load float, ptr [[TMP185]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP276]], ptr [[TMP184]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP187:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1, i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP188:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1, i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP278:%.*]] = load float, ptr [[TMP188]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP278]], ptr [[TMP187]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP183:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP281:%.*]] = load i32, ptr [[TMP183]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store 
i32 [[TMP281]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP284:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP186:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP287:%.*]] = load i32, ptr [[TMP186]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP287]], ptr [[TMP284]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP290:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP189:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP190:%.*]] = load i32, ptr [[TMP189]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP190]], ptr [[TMP290]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP191:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP192:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP193:%.*]] = load i32, ptr [[TMP192]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP193]], ptr [[TMP191]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP194:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP195:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP196:%.*]] = load i32, ptr [[TMP195]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP196]], ptr [[TMP194]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP197:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP198:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP199:%.*]] = load i32, ptr [[TMP198]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP199]], ptr [[TMP197]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP200:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP201:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP202:%.*]] = load i32, ptr [[TMP201]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP202]], ptr [[TMP200]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP203:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP204:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP205:%.*]] = load i32, ptr [[TMP204]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP205]], ptr [[TMP203]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP206:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP207:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP208:%.*]] = load i32, ptr [[TMP207]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP208]], ptr [[TMP206]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP209:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP210:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP211:%.*]] = load i32, ptr [[TMP210]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP211]], ptr [[TMP209]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP212:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP213:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP214:%.*]] = load i32, ptr [[TMP213]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP214]], ptr [[TMP212]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP215:%.*]] = getelementptr inbounds 
i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP216:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP217:%.*]] = load i32, ptr [[TMP216]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP217]], ptr [[TMP215]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP293:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP219:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP296:%.*]] = load i32, ptr [[TMP219]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP296]], ptr [[TMP293]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP221:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP299:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP302:%.*]] = load i32, ptr [[TMP299]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP302]], ptr [[TMP221]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP305:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP308:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP311:%.*]] = load i32, ptr [[TMP308]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP311]], ptr [[TMP305]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP227:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 15 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP314:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 15 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP229:%.*]] = load i32, ptr [[TMP314]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP229]], ptr [[TMP227]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP317:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 16 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP231:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 16 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP320:%.*]] = load i32, ptr [[TMP231]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP320]], ptr [[TMP317]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP233:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 17 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP323:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 17 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP235:%.*]] = load i32, ptr [[TMP323]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP235]], ptr [[TMP233]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP326:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 18 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP237:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 18 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP329:%.*]] = load i32, ptr [[TMP237]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP329]], ptr [[TMP326]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP239:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 19 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP332:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 19 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP241:%.*]] = load i32, ptr [[TMP332]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP241]], ptr [[TMP239]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP335:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 20 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP243:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 20 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP338:%.*]] = load i32, ptr [[TMP243]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP338]], ptr [[TMP335]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP245:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 21 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP341:%.*]] = getelementptr 
inbounds i32, ptr [[TMP183]], i32 21 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP247:%.*]] = load i32, ptr [[TMP341]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP247]], ptr [[TMP245]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP344:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP249:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP347:%.*]] = load i32, ptr [[TMP249]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP347]], ptr [[TMP344]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP251:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP350:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP253:%.*]] = load i32, ptr [[TMP350]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP253]], ptr [[TMP251]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP353:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP255:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP356:%.*]] = load i32, ptr [[TMP255]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP356]], ptr [[TMP353]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP257:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP359:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP259:%.*]] = load i32, ptr [[TMP359]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP259]], ptr [[TMP257]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP362:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP261:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 26 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP365:%.*]] = load i32, ptr [[TMP261]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP365]], ptr [[TMP362]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP263:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP396:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP265:%.*]] = load i32, ptr [[TMP396]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP265]], ptr [[TMP263]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP485:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP267:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP487:%.*]] = load i32, ptr [[TMP267]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP487]], ptr [[TMP485]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP269:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP270:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP488:%.*]] = load i32, ptr [[TMP270]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP488]], ptr [[TMP269]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP272:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP274:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa4i32a30i32s(i32 2, i32 4, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [4 x i32] poison, [30 x i32] [[TMP272]]), !continuation.registercount [[META24]], !continuation.returnedRegistercount [[META24]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP490:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } [[TMP274]], 2 -; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[TMP490]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP358:%.*]] = freeze [[STRUCT_OUTERPAYLOAD]] poison -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_OUTERPAYLOAD]] [[TMP358]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP224:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP277:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP277]], ptr [[TMP224]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP218:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP279:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP280:%.*]] = load i32, ptr [[TMP279]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP280]], ptr [[TMP218]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP220:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP282:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP283:%.*]] = load i32, ptr [[TMP282]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP283]], ptr [[TMP220]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP222:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP285:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP286:%.*]] = load i32, ptr [[TMP285]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP286]], ptr [[TMP222]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP225:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP288:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP289:%.*]] = load i32, ptr [[TMP288]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP289]], ptr [[TMP225]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP226:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP291:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP292:%.*]] = load i32, ptr [[TMP291]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP292]], ptr [[TMP226]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP228:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP294:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP295:%.*]] = load i32, ptr [[TMP294]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP295]], ptr [[TMP228]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP230:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP297:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP298:%.*]] = load i32, ptr [[TMP297]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP298]], ptr [[TMP230]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP232:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP300:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP301:%.*]] = load i32, ptr [[TMP300]], 
align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP301]], ptr [[TMP232]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP234:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP303:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP304:%.*]] = load i32, ptr [[TMP303]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP304]], ptr [[TMP234]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP236:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP306:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP307:%.*]] = load i32, ptr [[TMP306]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP307]], ptr [[TMP236]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP238:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP309:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP310:%.*]] = load i32, ptr [[TMP309]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP310]], ptr [[TMP238]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP240:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP312:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP313:%.*]] = load i32, ptr [[TMP312]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP313]], ptr [[TMP240]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP242:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP315:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP316:%.*]] = load i32, ptr [[TMP315]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP316]], ptr 
[[TMP242]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP244:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP491:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP319:%.*]] = load i32, ptr [[TMP491]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP319]], ptr [[TMP244]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP246:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 15 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP321:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 15 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP322:%.*]] = load i32, ptr [[TMP321]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP322]], ptr [[TMP246]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP248:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 16 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP324:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 16 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP325:%.*]] = load i32, ptr [[TMP324]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP325]], ptr [[TMP248]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP250:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 17 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP327:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 17 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP328:%.*]] = load i32, ptr [[TMP327]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP328]], ptr [[TMP250]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP252:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 18 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP330:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 18 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP331:%.*]] = load i32, ptr [[TMP330]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP331]], ptr [[TMP252]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP254:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 19 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP333:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 19 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP334:%.*]] = load i32, ptr [[TMP333]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP334]], ptr [[TMP254]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP256:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 20 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP336:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 20 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP337:%.*]] = load i32, ptr [[TMP336]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP337]], ptr [[TMP256]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP258:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 21 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP339:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 21 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP340:%.*]] = load i32, ptr [[TMP339]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP340]], ptr [[TMP258]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP260:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP342:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP343:%.*]] = load i32, ptr [[TMP342]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP343]], ptr [[TMP260]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP262:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP345:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP346:%.*]] = load i32, ptr [[TMP345]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP346]], ptr [[TMP262]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP264:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 
24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP348:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP349:%.*]] = load i32, ptr [[TMP348]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP349]], ptr [[TMP264]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP266:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP351:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP352:%.*]] = load i32, ptr [[TMP351]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP352]], ptr [[TMP266]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP268:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP354:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP355:%.*]] = load i32, ptr [[TMP354]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP355]], ptr [[TMP268]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP271:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP357:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP489:%.*]] = load i32, ptr [[TMP357]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP489]], ptr [[TMP271]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP273:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP360:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP361:%.*]] = load i32, ptr [[TMP360]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP361]], ptr [[TMP273]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP275:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP363:%.*]] = 
getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP364:%.*]] = load i32, ptr [[TMP363]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP364]], ptr [[TMP275]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP223:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } [[TMP274]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP223]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] -; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP366:%.*]] = load float, ptr [[TMP100]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP366]], ptr [[TMP101]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP367:%.*]] = load float, ptr [[TMP103]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP367]], ptr [[TMP104]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP368:%.*]] = load float, ptr [[TMP106]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP368]], ptr [[TMP107]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP369:%.*]] = load float, ptr [[TMP109]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP369]], ptr [[TMP110]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP370:%.*]] = load float, ptr [[TMP112]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP370]], ptr [[TMP113]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP371:%.*]] = load float, ptr [[TMP115]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP371]], ptr [[TMP116]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP372:%.*]] = load float, ptr [[TMP118]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP372]], ptr [[TMP119]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP373:%.*]] = load float, ptr [[TMP121]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP373]], ptr [[TMP122]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP374:%.*]] = load float, ptr [[TMP124]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP374]], ptr [[TMP125]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP375:%.*]] = load float, ptr [[TMP127]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP375]], ptr [[TMP128]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP376:%.*]] = load float, ptr [[TMP130]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP376]], ptr [[TMP131]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP377:%.*]] = load float, ptr [[TMP133]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP377]], ptr [[TMP134]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP378:%.*]] = load float, ptr [[TMP136]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP378]], ptr [[TMP137]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP379:%.*]] = load float, ptr [[TMP139]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP379]], ptr [[TMP140]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP380:%.*]] = load float, ptr [[TMP142]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP380]], ptr [[TMP143]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP381:%.*]] = load float, ptr [[TMP145]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP381]], ptr [[TMP146]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP492:%.*]] = load float, ptr [[TMP148]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP492]], ptr [[TMP149]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP383:%.*]] = load float, ptr [[TMP151]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP383]], ptr [[TMP152]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP384:%.*]] = load float, ptr [[TMP154]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP384]], ptr [[TMP155]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP385:%.*]] = load float, ptr [[TMP157]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP385]], ptr [[TMP158]], align 
4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP386:%.*]] = load float, ptr [[TMP160]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP386]], ptr [[TMP161]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP387:%.*]] = load float, ptr [[TMP163]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP387]], ptr [[TMP164]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP388:%.*]] = load float, ptr [[TMP166]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP388]], ptr [[TMP167]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP389:%.*]] = load float, ptr [[TMP169]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP389]], ptr [[TMP170]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP390:%.*]] = load float, ptr [[TMP172]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP390]], ptr [[TMP173]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP391:%.*]] = load float, ptr [[TMP175]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP391]], ptr [[TMP176]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP392:%.*]] = load float, ptr [[TMP178]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP392]], ptr [[TMP179]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP393:%.*]] = load float, ptr [[TMP181]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP393]], ptr [[TMP182]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP394:%.*]] = load float, ptr [[TMP184]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP394]], ptr [[TMP185]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP395:%.*]] = load float, ptr [[TMP187]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP395]], ptr [[TMP188]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP318:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP397:%.*]] = load i32, ptr [[TMP318]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP397]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP398:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP399:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP400:%.*]] = load i32, ptr [[TMP399]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP400]], ptr [[TMP398]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP401:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP402:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP403:%.*]] = load i32, ptr [[TMP402]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP403]], ptr [[TMP401]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP404:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP405:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP406:%.*]] = load i32, ptr [[TMP405]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP406]], ptr [[TMP404]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP407:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP408:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP409:%.*]] = load i32, ptr [[TMP408]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP409]], ptr [[TMP407]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP410:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP411:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP412:%.*]] = load i32, ptr [[TMP411]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP412]], ptr [[TMP410]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP413:%.*]] = getelementptr inbounds i32, 
ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP414:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP415:%.*]] = load i32, ptr [[TMP414]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP415]], ptr [[TMP413]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP416:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP417:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP418:%.*]] = load i32, ptr [[TMP417]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP418]], ptr [[TMP416]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP419:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP420:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP421:%.*]] = load i32, ptr [[TMP420]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP421]], ptr [[TMP419]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP422:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP423:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP424:%.*]] = load i32, ptr [[TMP423]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP424]], ptr [[TMP422]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP425:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP426:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 10 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP427:%.*]] = load i32, ptr [[TMP426]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP427]], ptr [[TMP425]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP428:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 11 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP429:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 11 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP430:%.*]] = load i32, ptr [[TMP429]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP430]], ptr [[TMP428]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP431:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP432:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 12 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP433:%.*]] = load i32, ptr [[TMP432]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP433]], ptr [[TMP431]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP434:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP435:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP436:%.*]] = load i32, ptr [[TMP435]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP436]], ptr [[TMP434]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP437:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP438:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP439:%.*]] = load i32, ptr [[TMP438]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP439]], ptr [[TMP437]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP440:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 15 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP441:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 15 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP442:%.*]] = load i32, ptr [[TMP441]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP442]], ptr [[TMP440]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP443:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 16 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP444:%.*]] = getelementptr 
inbounds i32, ptr [[TMP318]], i32 16 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP445:%.*]] = load i32, ptr [[TMP444]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP445]], ptr [[TMP443]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP446:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 17 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP447:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 17 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP448:%.*]] = load i32, ptr [[TMP447]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP448]], ptr [[TMP446]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP449:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 18 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP450:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 18 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP451:%.*]] = load i32, ptr [[TMP450]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP451]], ptr [[TMP449]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP452:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 19 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP453:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 19 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP454:%.*]] = load i32, ptr [[TMP453]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP454]], ptr [[TMP452]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP455:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 20 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP456:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 20 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP457:%.*]] = load i32, ptr [[TMP456]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP457]], ptr [[TMP455]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP458:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 21 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP459:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 21 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP460:%.*]] = load i32, ptr [[TMP459]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP460]], ptr [[TMP458]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP461:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP462:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 22 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP463:%.*]] = load i32, ptr [[TMP462]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP463]], ptr [[TMP461]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP464:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP465:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP466:%.*]] = load i32, ptr [[TMP465]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP466]], ptr [[TMP464]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP467:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP468:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 24 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP469:%.*]] = load i32, ptr [[TMP468]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP469]], ptr [[TMP467]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP470:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP471:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP472:%.*]] = load i32, ptr [[TMP471]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP472]], ptr [[TMP470]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP473:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP474:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 26 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP475:%.*]] = load i32, ptr 
[[TMP474]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP475]], ptr [[TMP473]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP476:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP477:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 27 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP478:%.*]] = load i32, ptr [[TMP477]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP478]], ptr [[TMP476]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP479:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP480:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 28 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP481:%.*]] = load i32, ptr [[TMP480]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP481]], ptr [[TMP479]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP482:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP483:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 29 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP484:%.*]] = load i32, ptr [[TMP483]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP484]], ptr [[TMP482]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP382:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP486:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]], [4 x i32] poison, [30 x i32] [[TMP486]]), !continuation.registercount [[META24]] -; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; - %1 = alloca %struct.OuterPayload, align 8 - %2 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 0 - %3 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 0 - %4 = load float, float* %3, align 4 - store float %4, float* %2, align 8 - %5 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 1 - %6 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 1 - %7 = load float, float* %6, align 4 - store float %7, float* %5, align 4 - %8 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 2 - %9 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 2 - %10 = load float, float* %9, align 4 - store float %10, float* %8, align 8 - %11 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 3 - %12 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 3 - %13 = load float, float* %12, align 4 - store float %13, float* %11, align 4 - %14 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 4 - %15 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 4 - %16 = load float, float* %15, align 4 - store float %16, float* %14, align 8 - %17 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 5 - %18 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 5 - %19 = load float, float* %18, align 4 - store float %19, float* %17, align 4 - %20 = 
getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 6 - %21 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 6 - %22 = load float, float* %21, align 4 - store float %22, float* %20, align 8 - %23 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 7 - %24 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 7 - %25 = load float, float* %24, align 4 - store float %25, float* %23, align 4 - %26 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 8 - %27 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 8 - %28 = load float, float* %27, align 4 - store float %28, float* %26, align 8 - %29 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 9 - %30 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 9 - %31 = load float, float* %30, align 4 - store float %31, float* %29, align 4 - %32 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 10 - %33 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 10 - %34 = load float, float* %33, align 4 - store float %34, float* %32, align 8 - %35 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 11 - %36 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 11 - %37 = load float, float* %36, align 4 - store float %37, float* %35, align 4 - %38 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 12 - %39 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 12 - %40 = load float, float* %39, align 4 - store float %40, 
float* %38, align 8 - %41 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 13 - %42 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 13 - %43 = load float, float* %42, align 4 - store float %43, float* %41, align 4 - %44 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 14 - %45 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 0, i32 14 - %46 = load float, float* %45, align 4 - store float %46, float* %44, align 8 - %47 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 0 - %48 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 0 - %49 = load float, float* %48, align 4 - store float %49, float* %47, align 4 - %50 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 1 - %51 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 1 - %52 = load float, float* %51, align 4 - store float %52, float* %50, align 4 - %53 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 2 - %54 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 2 - %55 = load float, float* %54, align 4 - store float %55, float* %53, align 4 - %56 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 3 - %57 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 3 - %58 = load float, float* %57, align 4 - store float %58, float* %56, align 4 - %59 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 4 - %60 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 4 - %61 = load float, float* %60, align 
4 - store float %61, float* %59, align 4 - %62 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 5 - %63 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 5 - %64 = load float, float* %63, align 4 - store float %64, float* %62, align 4 - %65 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 6 - %66 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 6 - %67 = load float, float* %66, align 4 - store float %67, float* %65, align 4 - %68 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 7 - %69 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 7 - %70 = load float, float* %69, align 4 - store float %70, float* %68, align 4 - %71 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 8 - %72 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 8 - %73 = load float, float* %72, align 4 - store float %73, float* %71, align 4 - %74 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 9 - %75 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 9 - %76 = load float, float* %75, align 4 - store float %76, float* %74, align 4 - %77 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 10 - %78 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 10 - %79 = load float, float* %78, align 4 - store float %79, float* %77, align 4 - %80 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 11 - %81 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 11 - %82 = load 
float, float* %81, align 4 - store float %82, float* %80, align 4 - %83 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 12 - %84 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 12 - %85 = load float, float* %84, align 4 - store float %85, float* %83, align 4 - %86 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 13 - %87 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 13 - %88 = load float, float* %87, align 4 - store float %88, float* %86, align 4 - %89 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 1, i32 14 - %90 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %outerPayload, i32 0, i32 1, i32 14 - %91 = load float, float* %90, align 4 - store float %91, float* %89, align 4 - call void @dx.op.callShader.struct.OuterPayload(i32 159, i32 0, %struct.OuterPayload* nonnull %1) - %92 = load float, float* %2, align 8 - store float %92, float* %3, align 4 - %93 = load float, float* %5, align 4 - store float %93, float* %6, align 4 - %94 = load float, float* %8, align 8 - store float %94, float* %9, align 4 - %95 = load float, float* %11, align 4 - store float %95, float* %12, align 4 - %96 = load float, float* %14, align 8 - store float %96, float* %15, align 4 - %97 = load float, float* %17, align 4 - store float %97, float* %18, align 4 - %98 = load float, float* %20, align 8 - store float %98, float* %21, align 4 - %99 = load float, float* %23, align 4 - store float %99, float* %24, align 4 - %100 = load float, float* %26, align 8 - store float %100, float* %27, align 4 - %101 = load float, float* %29, align 4 - store float %101, float* %30, align 4 - %102 = load float, float* %32, align 8 - store float %102, float* %33, align 4 - %103 = load float, float* %35, align 4 - store float %103, float* %36, align 4 - %104 = load 
float, float* %38, align 8 - store float %104, float* %39, align 4 - %105 = load float, float* %41, align 4 - store float %105, float* %42, align 4 - %106 = load float, float* %44, align 8 - store float %106, float* %45, align 4 - %107 = load float, float* %47, align 4 - store float %107, float* %48, align 4 - %108 = load float, float* %50, align 4 - store float %108, float* %51, align 4 - %109 = load float, float* %53, align 4 - store float %109, float* %54, align 4 - %110 = load float, float* %56, align 4 - store float %110, float* %57, align 4 - %111 = load float, float* %59, align 4 - store float %111, float* %60, align 4 - %112 = load float, float* %62, align 4 - store float %112, float* %63, align 4 - %113 = load float, float* %65, align 4 - store float %113, float* %66, align 4 - %114 = load float, float* %68, align 4 - store float %114, float* %69, align 4 - %115 = load float, float* %71, align 4 - store float %115, float* %72, align 4 - %116 = load float, float* %74, align 4 - store float %116, float* %75, align 4 - %117 = load float, float* %77, align 4 - store float %117, float* %78, align 4 - %118 = load float, float* %80, align 4 - store float %118, float* %81, align 4 - %119 = load float, float* %83, align 4 - store float %119, float* %84, align 4 - %120 = load float, float* %86, align 4 - store float %120, float* %87, align 4 - %121 = load float, float* %89, align 4 - store float %121, float* %90, align 4 - ret void -} - -; Function Attrs: nounwind -declare !pointeetys !29 void @dx.op.traceRay.struct.InnerPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.InnerPayload*) #0 - -; Function Attrs: nounwind -declare !pointeetys !31 void @dx.op.callShader.struct.OuterPayload(i32, i32, %struct.OuterPayload*) #0 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1 - -; Function Attrs: nounwind 
memory(read) -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2 - -; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i32, %struct.TraversalData) #3 - -; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitShader(i32, i32, %struct.DispatchSystemData) #3 - -; Function Attrs: alwaysinline -declare !pointeetys !32 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #3 - -; Function Attrs: alwaysinline -declare !pointeetys !34 void @_cont_SetTriangleHitAttributes(%struct.SystemData*, %struct.BuiltInTriangleIntersectionAttributes) #3 - -; Function Attrs: alwaysinline -declare !pointeetys !35 i1 @_cont_IsEndSearch(%struct.TraversalData*) #3 - -; Function Attrs: nounwind memory(read) -declare !pointeetys !37 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, %struct.HitData*) #2 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !39 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 - -; Function Attrs: nounwind memory(none) -declare !pointeetys !41 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1 - -declare !pointeetys !43 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -; Function Attrs: alwaysinline -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #3 !pointeetys !43 { -; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex( -; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) #[[ATTR3:[0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: ret i32 5 -; - ret i32 5 -} - -; Function Attrs: alwaysinline -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #3 !pointeetys !44 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %sys_data = insertvalue 
%struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i32 4, %struct.TraversalData %trav_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -; Function Attrs: alwaysinline -define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #3 !pointeetys !45 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i32 2, i32 poison, %struct.DispatchSystemData %dis_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !46 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !46 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(none) } -attributes #2 = { nounwind memory(read) } -attributes #3 = { alwaysinline } -attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{!7} -!dx.dxrPayloadAnnotations = !{!12} -!dx.entryPoints = !{!17, !18, !21} -!lgc.rt.max.attribute.size = !{!49} - -!0 = !{!"dxcoob 2019.05.00"} -!1 = !{i32 1, i32 7} -!2 = !{!"lib", i32 6, i32 7} -!3 = !{!4, null, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* 
@"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"myAccelerationStructure", i32 0, i32 3, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{i32 1, void (%struct.OuterPayload*)* @Miss, !8, void (%struct.OuterPayload*)* @Callable, !8} -!8 = !{!9, !11} -!9 = !{i32 1, !10, !10} -!10 = !{} -!11 = !{i32 2, !10, !10} -!12 = !{i32 0, %struct.OuterPayload undef, !13, %struct.InnerPayload undef, !16} -!13 = !{!14, !15} -!14 = !{i32 0, i32 771} -!15 = !{i32 0, i32 3} -!16 = !{!15} -!17 = !{null, !"", null, !3, null} -!18 = !{void (%struct.OuterPayload*)* @Miss, !"Miss", null, null, !19} -!19 = !{i32 8, i32 11, i32 6, i32 120, i32 5, !20} -!20 = !{i32 0} -!21 = !{void (%struct.OuterPayload*)* @Callable, !"Callable", null, null, !22} -!22 = !{i32 8, i32 12, i32 6, i32 120, i32 5, !20} -!23 = !{%struct.OuterPayload poison} -!24 = !{i32 0, %struct.OuterPayload poison} -!25 = !{!26, !26, i64 0} -!26 = !{!"float", !27, i64 0} -!27 = !{!"omnipotent char", !28, i64 0} -!28 = !{!"Simple C/C++ TBAA"} -!29 = !{%struct.InnerPayload poison} -!30 = !{i32 0, %struct.InnerPayload poison} -!31 = !{%struct.OuterPayload poison} -!32 = !{%struct.SystemData poison} -!33 = !{i32 0, %struct.SystemData poison} -!34 = !{%struct.SystemData poison} -!35 = !{%struct.TraversalData poison} -!36 = !{i32 0, %struct.TraversalData poison} -!37 = !{null, %struct.SystemData poison, %struct.HitData poison} -!38 = !{i32 0, %struct.HitData poison} -!39 = !{%struct.DispatchSystemData poison} -!40 = !{i32 0, %struct.DispatchSystemData poison} -!41 = !{%struct.AnyHitTraversalData poison} -!42 = !{i32 0, %struct.AnyHitTraversalData poison} -!43 = !{%struct.DispatchSystemData poison} -!44 = !{%struct.DispatchSystemData poison} -!45 = !{%struct.DispatchSystemData poison} -!46 = !{i8 poison} -!47 = !{i32 0, i8 poison} -!48 = !{%struct.AnyHitTraversalData poison} -!49 = !{i32 8} diff --git a/llvmraytracing/test/dx/payload-save-registers.ll.hlsl 
b/llvmraytracing/test/dx/payload-save-registers.ll.hlsl deleted file mode 100644 index 0baf77eb28..0000000000 --- a/llvmraytracing/test/dx/payload-save-registers.ll.hlsl +++ /dev/null @@ -1,34 +0,0 @@ -// This file is not a test itself, but used to generate the .ll test file. - -struct[raypayload] OuterPayload { - // These are written in miss, so they are not saved before recursive - // TraceRay in miss - float v1[15] : write(caller, miss) : read(caller, miss); - // These need to be saved before recursive TraceRay. - // However, these are only partially in registers, - // so are only saved partially. The memory part does not need - // to be saved. - float v2[15] : write(caller) : read(caller); -}; - -struct[raypayload] InnerPayload { - float v1 : write(caller) : read(caller); -}; - -RaytracingAccelerationStructure myAccelerationStructure : register(t3); -RWTexture2D gOutput : register(u0); - -[shader("miss")] void Miss(inout OuterPayload outerPayload) { - InnerPayload innerPayload; - innerPayload.v1 = outerPayload.v1[14]; - - RayDesc myRay = {float3(0., 0., 0.), 0., float3(0., 0., 0.), 1.0}; - - TraceRay(myAccelerationStructure, 0, 0, 0, 0, 0, myRay, innerPayload); - - outerPayload.v1[14] = innerPayload.v1; -} - - [shader("callable")] void callable(inout OuterPayload outerPayload) { - CallShader(0, outerPayload); -} diff --git a/llvmraytracing/test/dx/remat-indirect-load.ll b/llvmraytracing/test/dx/remat-indirect-load.ll deleted file mode 100644 index 73ccacc1f6..0000000000 --- a/llvmraytracing/test/dx/remat-indirect-load.ll +++ /dev/null @@ -1,46 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: opt -debug-only=dxil-coro-split -passes='dxil-coro-split' -S %s 2>&1 | FileCheck %s -; -; Test that an indirect handle load pattern does not produce a rematerialization -; warning. We know that remat in this case is not profitable. 
-; -; REQUIRES: assertions - -; CHECK-NOT: Warning: isRematerializableDxilLoad unhandled pattern: {{.*}} = extractvalue %dx.types.ResRet.i32 - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" -target triple = "dxil-ms-dx" - -%dx.types.ResRet.i32 = type { i32, i32, i32, i32, i32 } -%dx.types.Handle = type { ptr } - -; Function Attrs: presplitcoroutine -define { ptr, ptr } @"indirect_handle_load"() #0 { -_cont_RayTCurrent.exit: - %0 = call token @llvm.coro.id.retcon(i32 0, i32 0, ptr null, ptr @"continuation.prototype.indirect_handle_load", ptr @continuation.malloc, ptr @continuation.free) - %1 = call ptr @llvm.coro.begin(token %0, ptr null) - %2 = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 0, %dx.types.Handle zeroinitializer) - %3 = extractvalue %dx.types.ResRet.i32 %2, 0 - %4 = call %dx.types.Handle @dx.op.createHandleFromHeap(i32 0, i32 %3) - %5 = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 0, %dx.types.Handle %4) - ret { ptr, ptr } zeroinitializer -} - -declare %dx.types.Handle @dx.op.createHandleFromHeap(i32, i32) - -declare %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32, %dx.types.Handle) - -declare ptr @continuation.malloc(i32) - -declare void @continuation.free(ptr) - -; Function Attrs: nounwind -declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) #1 - -; Function Attrs: nounwind -declare ptr @llvm.coro.begin(token, ptr writeonly) #1 - -declare { ptr, ptr } @"continuation.prototype.indirect_handle_load"(ptr) - -attributes #0 = { presplitcoroutine } -attributes #1 = { nounwind } diff --git a/llvmraytracing/test/dx/remove-types-metadata.ll b/llvmraytracing/test/dx/remove-types-metadata.ll deleted file mode 100644 index 8076be668b..0000000000 --- a/llvmraytracing/test/dx/remove-types-metadata.ll +++ /dev/null @@ -1,545 
+0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 3 -; RUN: opt --verify-each -passes='continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=METADATA %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.DispatchSystemData = type { <3 x i32> } -%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float, i32 } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.HitData = type { <3 x float>, <3 x float>, float, i32 } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.RayPayload = type { <4 x float> } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -declare i32 @_cont_GetContinuationStackAddr() #0 - -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i32, %struct.TraversalData) #0 - -declare %struct.DispatchSystemData @_AmdAwaitShader(i32, %struct.DispatchSystemData) #0 - -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i32, %struct.AnyHitTraversalData, float, i32) #0 - -declare !pointeetys !31 %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData*) #0 - -declare !pointeetys !33 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0 - -declare !pointeetys !35 
%struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 - -declare !pointeetys !36 void @_cont_SetTriangleHitAttributes(%struct.SystemData*, %struct.BuiltInTriangleIntersectionAttributes) #0 - -declare !pointeetys !37 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) - -declare !pointeetys !39 i1 @_cont_IsEndSearch(%struct.TraversalData*) #0 - -declare !pointeetys !41 i32 @_cont_HitKind(%struct.SystemData*) #0 - -; Function Attrs: nounwind -declare i32 @_AmdGetResumePointAddr() #1 - -; Function Attrs: nounwind -declare !pointeetys !42 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 - -; Function Attrs: nounwind -declare !pointeetys !43 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1 - -; Function Attrs: nounwind -declare !pointeetys !42 void @_cont_AcceptHitAndEndSearch(%struct.DispatchSystemData* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !43 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !42 void @_cont_IgnoreHit(%struct.DispatchSystemData* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !43 void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* nocapture readnone) #1 - -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !44 { -; METADATA-LABEL: define void @_cont_TraceRay( -; METADATA-SAME: ptr [[DATA:%.*]], i64 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], float [[TMP6:%.*]], float [[TMP7:%.*]], float [[TMP8:%.*]], float [[TMP9:%.*]], float [[TMP10:%.*]], float [[TMP11:%.*]], float [[TMP12:%.*]], float [[TMP13:%.*]]) #[[ATTR0:[0-9]+]] { -; METADATA-NEXT: [[DIS_DATA:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[DATA]], 
align 4 -; METADATA-NEXT: [[SYS_DATA:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA]], 0 -; METADATA-NEXT: [[TRAV_DATA:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA]], 0 -; METADATA-NEXT: [[ADDR:%.*]] = call i32 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] -; METADATA-NEXT: [[TRAV_DATA2:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA]], i32 [[ADDR]], 5 -; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[_AMDAWAITTRAVERSAL:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 4, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2]]) -; METADATA-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[NEWDATA]], ptr [[DATA]], align 4 -; METADATA-NEXT: call void @_AmdRestoreSystemData(ptr [[DATA]]) -; METADATA-NEXT: ret void -; - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %addr = call i32 @_AmdGetResumePointAddr() #3 - %trav_data2 = insertvalue %struct.TraversalData %trav_data, i32 %addr, 5 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i32 4, %struct.TraversalData %trav_data2) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !pointeetys !45 { -; METADATA-LABEL: define void @_cont_CallShader( -; METADATA-SAME: ptr [[DATA:%.*]], i32 [[TMP0:%.*]]) #[[ATTR0]] { -; METADATA-NEXT: [[DIS_DATA:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[DATA]], align 4 -; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[_AMDAWAITSHADER:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 2, 
[[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA]]) -; METADATA-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[NEWDATA]], ptr [[DATA]], align 4 -; METADATA-NEXT: call void @_AmdRestoreSystemData(ptr [[DATA]]) -; METADATA-NEXT: ret void -; - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i32 2, %struct.DispatchSystemData %dis_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 !pointeetys !46 { -; METADATA-LABEL: define i1 @_cont_ReportHit( -; METADATA-SAME: ptr [[DATA:%.*]], float [[T:%.*]], i32 [[HITKIND:%.*]]) #[[ATTR0]] { -; METADATA-NEXT: [[ORIGTPTR:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA:%.*]], ptr [[DATA]], i32 0, i32 0, i32 4 -; METADATA-NEXT: [[ORIGT:%.*]] = load float, ptr [[ORIGTPTR]], align 4 -; METADATA-NEXT: [[ISNOHIT:%.*]] = fcmp fast uge float [[T]], [[ORIGT]] -; METADATA-NEXT: br i1 [[ISNOHIT]], label [[ISEND:%.*]], label [[CALLAHIT:%.*]] -; METADATA: callAHit: -; METADATA-NEXT: [[TRAV_DATA:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[DATA]], align 4 -; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_ANYHITTRAVERSALDATA]] @[[_AMDAWAITANYHIT:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 3, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA]], float [[T]], i32 [[HITKIND]]) -; METADATA-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[NEWDATA]], ptr [[DATA]], align 4 -; METADATA-NEXT: call void @_AmdRestoreSystemDataAnyHit(ptr [[DATA]]) -; METADATA-NEXT: ret i1 true -; METADATA: isEnd: -; METADATA-NEXT: call void @_AmdAcceptHitAttributes(ptr [[DATA]]) -; METADATA-NEXT: ret i1 false -; - %origTPtr = getelementptr inbounds %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, i32 0, i32 0, i32 4 - %origT = 
load float, float* %origTPtr, align 4 - %isNoHit = fcmp fast uge float %t, %origT - br i1 %isNoHit, label %isEnd, label %callAHit - -callAHit: ; preds = %0 - %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i32 3, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) - store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 - call void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* %data) - ret i1 true - -isEnd: ; preds = %0 - call void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* %data) - ret i1 false -} - -define i32 @_cont_DispatchRaysIndex(%struct.DispatchSystemData* %data, i32 %i) !pointeetys !47 { -; METADATA-LABEL: define i32 @_cont_DispatchRaysIndex( -; METADATA-SAME: ptr [[DATA:%.*]], i32 [[I:%.*]]) { -; METADATA-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 0, i32 [[I]] -; METADATA-NEXT: [[RES:%.*]] = load i32, ptr [[RESPTR]], align 4 -; METADATA-NEXT: ret i32 [[RES]] -; - %resPtr = getelementptr %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0, i32 %i - %res = load i32, i32* %resPtr, align 4 - ret i32 %res -} - -define float @_cont_ObjectRayOrigin(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData, i32 %i) !pointeetys !48 { -; METADATA-LABEL: define float @_cont_ObjectRayOrigin( -; METADATA-SAME: ptr nocapture readnone [[DATA:%.*]], ptr [[HITDATA:%.*]], i32 [[I:%.*]]) { -; METADATA-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_HITDATA:%.*]], ptr [[HITDATA]], i32 0, i32 0, i32 [[I]] -; METADATA-NEXT: [[RES:%.*]] = load float, ptr [[RESPTR]], align 4 -; METADATA-NEXT: ret float [[RES]] -; - %resPtr = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 %i - %res = load float, float* %resPtr, align 4 - ret float %res -} - -define float 
@_cont_ObjectRayDirection(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData, i32 %i) !pointeetys !48 { -; METADATA-LABEL: define float @_cont_ObjectRayDirection( -; METADATA-SAME: ptr nocapture readnone [[DATA:%.*]], ptr [[HITDATA:%.*]], i32 [[I:%.*]]) { -; METADATA-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_HITDATA:%.*]], ptr [[HITDATA]], i32 0, i32 1, i32 [[I]] -; METADATA-NEXT: [[RES:%.*]] = load float, ptr [[RESPTR]], align 4 -; METADATA-NEXT: ret float [[RES]] -; - %resPtr = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 %i - %res = load float, float* %resPtr, align 4 - ret float %res -} - -define float @_cont_RayTCurrent(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) !pointeetys !50 { -; METADATA-LABEL: define float @_cont_RayTCurrent( -; METADATA-SAME: ptr nocapture readnone [[DATA:%.*]], ptr [[HITDATA:%.*]]) { -; METADATA-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_HITDATA:%.*]], ptr [[HITDATA]], i32 0, i32 2 -; METADATA-NEXT: [[RES:%.*]] = load float, ptr [[RESPTR]], align 4 -; METADATA-NEXT: ret float [[RES]] -; - %resPtr = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 2 - %res = load float, float* %resPtr, align 4 - ret float %res -} - -; Function Attrs: nounwind -define void @MyRayGen() #2 { -; METADATA: Function Attrs: nounwind -; METADATA-LABEL: define void @MyRayGen( -; METADATA-SAME: ) #[[ATTR2:[0-9]+]] { -; METADATA-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; METADATA-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; METADATA-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 4 -; METADATA-NEXT: [[TMP4:%.*]] = bitcast ptr [[TMP3]] to ptr -; METADATA-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP4]]) #[[ATTR1:[0-9]+]] -; METADATA-NEXT: [[TMP5:%.*]] = getelementptr 
inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 -; METADATA-NEXT: store <4 x float> zeroinitializer, ptr [[TMP5]], align 4, !tbaa [[TBAA31:![0-9]+]] -; METADATA-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) -; METADATA-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP6]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; METADATA-NEXT: call void @dx.op.traceRay.struct.RayPayload(i32 157, [[DX_TYPES_HANDLE]] [[TMP7]], i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, ptr nonnull [[TMP3]]) -; METADATA-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA31]] -; METADATA-NEXT: [[TMP9:%.*]] = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) -; METADATA-NEXT: [[TMP10:%.*]] = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) -; METADATA-NEXT: [[TMP11:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; METADATA-NEXT: [[TMP12:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP11]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) -; METADATA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP8]], i64 0 -; METADATA-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP8]], i64 1 -; METADATA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 2 -; METADATA-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP8]], i64 3 -; METADATA-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP12]], i32 [[TMP9]], i32 [[TMP10]], i32 undef, float [[TMP13]], float [[TMP14]], float [[TMP15]], 
float [[TMP16]], i8 15) -; METADATA-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP4]]) #[[ATTR1]] -; METADATA-NEXT: ret void -; - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.RayPayload, align 4 - %4 = bitcast %struct.RayPayload* %3 to i8* - call void @llvm.lifetime.start.p0i8(i64 16, i8* %4) #1 - %5 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 - store <4 x float> zeroinitializer, <4 x float>* %5, align 4, !tbaa !51 - %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) - %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }) - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) - %8 = load <4 x float>, <4 x float>* %5, align 4, !tbaa !51 - %9 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) - %10 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) - %11 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2) - %12 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %11, %dx.types.ResourceProperties { i32 4098, i32 1033 }) - %13 = extractelement <4 x float> %8, i64 0 - %14 = extractelement <4 x float> %8, i64 1 - %15 = extractelement <4 x float> %8, i64 2 - %16 = extractelement <4 x float> %8, i64 3 - call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %12, i32 %9, i32 %10, i32 undef, float %13, float %14, float %15, float %16, i8 15) - call void @llvm.lifetime.end.p0i8(i64 16, i8* 
%4) #1 - ret void -} - -; Function Attrs: nounwind -define void @MyClosestHitShader(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #2 !pointeetys !54 { -; METADATA: Function Attrs: nounwind -; METADATA-LABEL: define void @MyClosestHitShader( -; METADATA-SAME: ptr noalias nocapture [[PAYLOAD:%.*]], ptr nocapture readonly [[ATTR:%.*]]) #[[ATTR2]] { -; METADATA-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], ptr [[ATTR]], i32 0, i32 0 -; METADATA-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 -; METADATA-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; METADATA-NEXT: [[TMP4:%.*]] = fsub fast float 1.000000e+00, [[TMP3]] -; METADATA-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; METADATA-NEXT: [[TMP6:%.*]] = fsub fast float [[TMP4]], [[TMP5]] -; METADATA-NEXT: [[TMP7:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i64 0 -; METADATA-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP3]], i64 1 -; METADATA-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP5]], i64 2 -; METADATA-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float 1.000000e+00, i64 3 -; METADATA-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD:%.*]], ptr [[PAYLOAD]], i32 0, i32 0 -; METADATA-NEXT: store <4 x float> [[TMP10]], ptr [[TMP11]], align 4 -; METADATA-NEXT: ret void -; - %1 = getelementptr inbounds %struct.BuiltInTriangleIntersectionAttributes, %struct.BuiltInTriangleIntersectionAttributes* %attr, i32 0, i32 0 - %2 = load <2 x float>, <2 x float>* %1, align 4 - %3 = extractelement <2 x float> %2, i32 0 - %4 = fsub fast float 1.000000e+00, %3 - %5 = extractelement <2 x float> %2, i32 1 - %6 = fsub fast float %4, %5 - %7 = insertelement <4 x float> undef, float %6, i64 0 - %8 = insertelement <4 x float> %7, float %3, i64 1 - %9 = insertelement <4 x float> 
%8, float %5, i64 2 - %10 = insertelement <4 x float> %9, float 1.000000e+00, i64 3 - %11 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - store <4 x float> %10, <4 x float>* %11, align 4 - ret void -} - -; Function Attrs: nounwind -define void @MyAnyHitShader(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readnone %attr) #2 !pointeetys !54 { -; METADATA: Function Attrs: nounwind -; METADATA-LABEL: define void @MyAnyHitShader( -; METADATA-SAME: ptr noalias nocapture [[PAYLOAD:%.*]], ptr nocapture readnone [[ATTR:%.*]]) #[[ATTR2]] { -; METADATA-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD:%.*]], ptr [[PAYLOAD]], i32 0, i32 0 -; METADATA-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 -; METADATA-NEXT: [[TMP3:%.*]] = call float @dx.op.objectRayOrigin.f32(i32 149, i8 0) -; METADATA-NEXT: [[TMP4:%.*]] = call float @dx.op.objectRayDirection.f32(i32 150, i8 0) -; METADATA-NEXT: [[TMP5:%.*]] = call float @dx.op.rayTCurrent.f32(i32 154) -; METADATA-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP5]], [[TMP4]] -; METADATA-NEXT: [[TMP7:%.*]] = fadd fast float [[TMP6]], [[TMP3]] -; METADATA-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP7]], 0.000000e+00 -; METADATA-NEXT: [[TMP9:%.*]] = fcmp fast ogt float [[TMP7]], 1.000000e+00 -; METADATA-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP7]], -1.000000e+00 -; METADATA-NEXT: br i1 [[TMP8]], label [[TMP11:%.*]], label [[TMP14:%.*]] -; METADATA: 11: -; METADATA-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4 -; METADATA-NEXT: br i1 [[TMP9]], label [[TMP12:%.*]], label [[TMP13:%.*]] -; METADATA: 12: -; METADATA-NEXT: call void @dx.op.acceptHitAndEndSearch(i32 156) -; METADATA-NEXT: unreachable -; METADATA: 13: -; METADATA-NEXT: call void @dx.op.acceptHitAndEndSearch(i32 156) -; METADATA-NEXT: ret void -; METADATA: 14: -; METADATA-NEXT: br i1 [[TMP10]], label [[TMP15:%.*]], label [[TMP18:%.*]] -; 
METADATA: 15: -; METADATA-NEXT: br i1 [[TMP9]], label [[TMP16:%.*]], label [[TMP17:%.*]] -; METADATA: 16: -; METADATA-NEXT: call void @dx.op.ignoreHit(i32 155) -; METADATA-NEXT: unreachable -; METADATA: 17: -; METADATA-NEXT: call void @dx.op.ignoreHit(i32 155) -; METADATA-NEXT: ret void -; METADATA: 18: -; METADATA-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4 -; METADATA-NEXT: ret void -; - %1 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - %2 = load <4 x float>, <4 x float>* %1, align 4 - %3 = call float @dx.op.objectRayOrigin.f32(i32 149, i8 0) - %4 = call float @dx.op.objectRayDirection.f32(i32 150, i8 0) - %5 = call float @dx.op.rayTCurrent.f32(i32 154) - %6 = fmul fast float %5, %4 - %7 = fadd fast float %6, %3 - %8 = fcmp fast ogt float %7, 0.000000e+00 - %9 = fcmp fast ogt float %7, 1.000000e+00 - %10 = fcmp fast ogt float %7, -1.000000e+00 - br i1 %8, label %11, label %14 - -11: ; preds = %0 - store <4 x float> %2, <4 x float>* %1, align 4 - br i1 %9, label %12, label %13 - -12: ; preds = %11 - call void @dx.op.acceptHitAndEndSearch(i32 156) - unreachable - -13: ; preds = %11 - call void @dx.op.acceptHitAndEndSearch(i32 156) - ret void - -14: ; preds = %0 - br i1 %10, label %15, label %18 - -15: ; preds = %14 - br i1 %9, label %16, label %17 - -16: ; preds = %15 - call void @dx.op.ignoreHit(i32 155) - unreachable - -17: ; preds = %15 - call void @dx.op.ignoreHit(i32 155) - ret void - -18: ; preds = %14 - store <4 x float> %2, <4 x float>* %1, align 4 - ret void -} - -; Function Attrs: nounwind -define void @MyIntersectionShader() #2 { -; METADATA: Function Attrs: nounwind -; METADATA-LABEL: define void @MyIntersectionShader( -; METADATA-SAME: ) #[[ATTR2]] { -; METADATA-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 4 -; METADATA-NEXT: [[TMP2:%.*]] = call float @dx.op.rayTCurrent.f32(i32 154) -; METADATA-NEXT: [[TMP3:%.*]] = bitcast ptr [[TMP1]] to ptr -; 
METADATA-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[TMP3]]) #[[ATTR1]] -; METADATA-NEXT: [[TMP4:%.*]] = call i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32 158, float [[TMP2]], i32 0, ptr nonnull [[TMP1]]) -; METADATA-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP3]]) #[[ATTR1]] -; METADATA-NEXT: ret void -; - %1 = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4 - %2 = call float @dx.op.rayTCurrent.f32(i32 154) - %3 = bitcast %struct.BuiltInTriangleIntersectionAttributes* %1 to i8* - call void @llvm.lifetime.start.p0i8(i64 8, i8* %3) #1 - %4 = call i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32 158, float %2, i32 0, %struct.BuiltInTriangleIntersectionAttributes* nonnull %1) - call void @llvm.lifetime.end.p0i8(i64 8, i8* %3) #1 - ret void -} - -; Function Attrs: nounwind -define void @MyMissShader(%struct.RayPayload* noalias nocapture %payload) #2 !pointeetys !57 { -; METADATA: Function Attrs: nounwind -; METADATA-LABEL: define void @MyMissShader( -; METADATA-SAME: ptr noalias nocapture [[PAYLOAD:%.*]]) #[[ATTR2]] { -; METADATA-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD:%.*]], ptr [[PAYLOAD]], i32 0, i32 0 -; METADATA-NEXT: store <4 x float> , ptr [[TMP1]], align 4 -; METADATA-NEXT: ret void -; - %1 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - store <4 x float> , <4 x float>* %1, align 4 - ret void -} - -; Function Attrs: nounwind -declare !pointeetys !58 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #1 - -; Function Attrs: nounwind -declare void @dx.op.textureStore.f32(i32, %dx.types.Handle, i32, i32, i32, float, float, float, float, i8) #1 - -; Function Attrs: nounwind memory(none) -declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #3 - -; Function Attrs: nounwind memory(none) -declare float 
@dx.op.objectRayDirection.f32(i32, i8) #3 - -; Function Attrs: nounwind memory(none) -declare float @dx.op.objectRayOrigin.f32(i32, i8) #3 - -; Function Attrs: nounwind memory(read) -declare float @dx.op.rayTCurrent.f32(i32) #4 - -declare void @dx.op.acceptHitAndEndSearch(i32) #0 - -declare void @dx.op.ignoreHit(i32) #0 - -; Function Attrs: nounwind -declare !pointeetys !59 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) #1 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #3 - -; Function Attrs: nounwind memory(read) -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !60 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !60 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5 - -attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind memory(none) } -attributes #4 = { nounwind memory(read) } -attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} 
-!dx.resources = !{!3} -!dx.typeAnnotations = !{!10} -!dx.entryPoints = !{!18, !20, !23, !25, !27, !29} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{i32 1, void ()* @MyRayGen, !11, void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyClosestHitShader, !14, void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyAnyHitShader, !14, void ()* @MyIntersectionShader, !11, void (%struct.RayPayload*)* @MyMissShader, !17} -!11 = !{!12} -!12 = !{i32 1, !13, !13} -!13 = !{} -!14 = !{!12, !15, !16} -!15 = !{i32 2, !13, !13} -!16 = !{i32 0, !13, !13} -!17 = !{!12, !15} -!18 = !{null, !"", null, !3, !19} -!19 = !{i32 0, i64 65536} -!20 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyAnyHitShader, !"MyAnyHitShader", null, null, !21} -!21 = !{i32 8, i32 9, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!22 = !{i32 0} -!23 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyClosestHitShader, !"MyClosestHitShader", null, null, !24} -!24 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!25 = !{void ()* @MyIntersectionShader, !"MyIntersectionShader", null, null, !26} -!26 = !{i32 8, i32 8, i32 5, !22} -!27 = !{void (%struct.RayPayload*)* @MyMissShader, !"MyMissShader", null, null, !28} -!28 = !{i32 8, i32 11, i32 6, i32 16, i32 5, !22} -!29 = 
!{void ()* @MyRayGen, !"MyRayGen", null, null, !30} -!30 = !{i32 8, i32 7, i32 5, !22} -!31 = !{%struct.AnyHitTraversalData poison} -!32 = !{i32 0, %struct.AnyHitTraversalData poison} -!33 = !{%struct.SystemData poison} -!34 = !{i32 0, %struct.SystemData poison} -!35 = !{%struct.SystemData poison} -!36 = !{%struct.SystemData poison} -!37 = !{%struct.DispatchSystemData poison} -!38 = !{i32 0, %struct.DispatchSystemData poison} -!39 = !{%struct.TraversalData poison} -!40 = !{i32 0, %struct.TraversalData poison} -!41 = !{%struct.SystemData poison} -!42 = !{%struct.DispatchSystemData poison} -!43 = !{%struct.AnyHitTraversalData poison} -!44 = !{%struct.DispatchSystemData poison} -!45 = !{%struct.DispatchSystemData poison} -!46 = !{%struct.AnyHitTraversalData poison} -!47 = !{%struct.DispatchSystemData poison} -!48 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!49 = !{i32 0, %struct.HitData poison} -!50 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!51 = !{!52, !52, i64 0} -!52 = !{!"omnipotent char", !53, i64 0} -!53 = !{!"Simple C/C++ TBAA"} -!54 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} -!55 = !{i32 0, %struct.RayPayload poison} -!56 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} -!57 = !{%struct.RayPayload poison} -!58 = !{%struct.RayPayload poison} -!59 = !{%struct.BuiltInTriangleIntersectionAttributes poison} -!60 = !{i8 poison} -;. 
-; METADATA: attributes #[[ATTR0]] = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -; METADATA: attributes #[[ATTR1]] = { nounwind } -; METADATA: attributes #[[ATTR2]] = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -; METADATA: attributes #[[ATTR3]] = { nounwind memory(none) } -; METADATA: attributes #[[ATTR4:[0-9]+]] = { nounwind memory(read) } -; METADATA: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -;. -; METADATA: [[META0:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; METADATA: [[META1:![0-9]+]] = !{i32 1, i32 6} -; METADATA: [[META2:![0-9]+]] = !{!"lib", i32 6, i32 6} -; METADATA: [[META3:![0-9]+]] = !{[[META4:![0-9]+]], [[META7:![0-9]+]], null, null} -; METADATA: [[META4]] = !{[[META5:![0-9]+]]} -; METADATA: [[META5]] = !{i32 0, ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, [[META6:![0-9]+]]} -; METADATA: [[META6]] = !{i32 0, i32 4} -; METADATA: [[META7]] = !{[[META8:![0-9]+]]} -; METADATA: [[META8]] = !{i32 0, ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, [[META9:![0-9]+]]} -; METADATA: [[META9]] = !{i32 0, i32 9} -; METADATA: [[META10:![0-9]+]] = !{i32 1, ptr @MyRayGen, [[META11:![0-9]+]], ptr @MyClosestHitShader, [[META14:![0-9]+]], ptr @MyAnyHitShader, [[META14]], ptr @MyIntersectionShader, [[META11]], ptr @MyMissShader, [[META17:![0-9]+]]} -; METADATA: [[META11]] = !{[[META12:![0-9]+]]} -; METADATA: [[META12]] = !{i32 1, [[META13:![0-9]+]], 
[[META13]]} -; METADATA: [[META13]] = !{} -; METADATA: [[META14]] = !{[[META12]], [[META15:![0-9]+]], [[META16:![0-9]+]]} -; METADATA: [[META15]] = !{i32 2, [[META13]], [[META13]]} -; METADATA: [[META16]] = !{i32 0, [[META13]], [[META13]]} -; METADATA: [[META17]] = !{[[META12]], [[META15]]} -; METADATA: [[META18:![0-9]+]] = !{null, !"", null, [[META3]], [[META19:![0-9]+]]} -; METADATA: [[META19]] = !{i32 0, i64 65536} -; METADATA: [[META20:![0-9]+]] = !{ptr @MyAnyHitShader, !"MyAnyHitShader", null, null, [[META21:![0-9]+]]} -; METADATA: [[META21]] = !{i32 8, i32 9, i32 6, i32 16, i32 7, i32 8, i32 5, [[META22:![0-9]+]]} -; METADATA: [[META22]] = !{i32 0} -; METADATA: [[META23:![0-9]+]] = !{ptr @MyClosestHitShader, !"MyClosestHitShader", null, null, [[META24:![0-9]+]]} -; METADATA: [[META24]] = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, [[META22]]} -; METADATA: [[META25:![0-9]+]] = !{ptr @MyIntersectionShader, !"MyIntersectionShader", null, null, [[META26:![0-9]+]]} -; METADATA: [[META26]] = !{i32 8, i32 8, i32 5, [[META22]]} -; METADATA: [[META27:![0-9]+]] = !{ptr @MyMissShader, !"MyMissShader", null, null, [[META28:![0-9]+]]} -; METADATA: [[META28]] = !{i32 8, i32 11, i32 6, i32 16, i32 5, [[META22]]} -; METADATA: [[META29:![0-9]+]] = !{ptr @MyRayGen, !"MyRayGen", null, null, [[META30:![0-9]+]]} -; METADATA: [[META30]] = !{i32 8, i32 7, i32 5, [[META22]]} -; METADATA: [[TBAA31]] = !{[[META32:![0-9]+]], [[META32]], i64 0} -; METADATA: [[META32]] = !{!"omnipotent char", [[META33:![0-9]+]], i64 0} -; METADATA: [[META33]] = !{!"Simple C/C++ TBAA"} -;. 
diff --git a/llvmraytracing/test/dx/remove-unused-declarations.ll b/llvmraytracing/test/dx/remove-unused-declarations.ll deleted file mode 100644 index 3667dee5bf..0000000000 --- a/llvmraytracing/test/dx/remove-unused-declarations.ll +++ /dev/null @@ -1,183 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-DECL %s -; RUN: opt --verify-each -passes='dxil-cont-prepare-gpurt-library,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,cleanup-continuations,lint,dxil-cont-post-process,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS-DECL %s - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.HitData = type { float, i32 } -%struct.DispatchSystemData = type { <3 x i32> } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.TraversalData = type { %struct.SystemData, <3 x float>, <3 x float>, float } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } - -declare i64 @_cont_GetTraversalAddr() #4 -declare i32 @_cont_GetContinuationStackAddr() #4 -declare !pointeetys !31 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.DispatchSystemData*) #4 -declare !pointeetys !32 void @_cont_SetTriangleHitAttributes(%struct.SystemData*, %struct.BuiltInTriangleIntersectionAttributes) #4 -declare %struct.DispatchSystemData @_cont_Traversal(%struct.TraversalData) #4 -declare !pointeetys !33 %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData*) #4 -declare 
!pointeetys !34 %struct.HitData @_cont_GetCommittedState(%struct.DispatchSystemData*) #4 -declare !pointeetys !53 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) - -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #4 !pointeetys !37 { - ret i32 5 -} - -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float) #4 !pointeetys !38 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data - %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %newdata = call %struct.DispatchSystemData @_cont_Traversal(%struct.TraversalData %trav_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data - ret void -} - -; Function Attrs: nounwind readnone -declare !pointeetys !40 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData* nocapture readnone %data) #2 -declare !pointeetys !40 <3 x i32> @_cont_DispatchRaysDimensions3(%struct.DispatchSystemData* nocapture readnone %data) #2 -declare !pointeetys !41 <3 x float> @_cont_WorldRayOrigin3(%struct.DispatchSystemData* nocapture readnone %data) #2 -declare !pointeetys !41 <3 x float> @_cont_WorldRayDirection3(%struct.DispatchSystemData* nocapture readnone %data) #2 -declare !pointeetys !42 float @_cont_RayTMin(%struct.DispatchSystemData* nocapture readnone %data) #2 -declare !pointeetys !43 float @_cont_RayTCurrent(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData*) #1 -declare !pointeetys !51 i32 @_cont_RayFlags(%struct.DispatchSystemData* nocapture readnone %data) #2 -declare !pointeetys !52 i32 @_cont_InstanceIndex(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData*) #2 -declare !pointeetys !52 i32 @_cont_InstanceID(%struct.DispatchSystemData* nocapture readnone %data, 
%struct.HitData*) #2 -declare !pointeetys !52 i32 @_cont_PrimitiveIndex(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData*) #2 -declare !pointeetys !46 <3 x float> @_cont_ObjectRayOrigin3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData*) #2 -declare !pointeetys !46 <3 x float> @_cont_ObjectRayDirection3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData*) #2 -declare !pointeetys !47 [4 x <3 x float>] @_cont_ObjectToWorld4x3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData*) #2 -declare !pointeetys !47 [4 x <3 x float>] @_cont_WorldToObject4x3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData*) #2 -declare !pointeetys !45 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone %data, %struct.HitData*) #2 - -%dx.types.Handle = type { i8* } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } -%struct.RayPayload = type { float, float, i32, i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -; Function Attrs: nounwind -define void @ClosestHit(%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*) #0 !pointeetys !48 { - %a = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) ; DispatchRaysIndex(col) - %b = call i32 @dx.op.dispatchRaysDimensions.i32(i32 146, i8 0) ; DispatchRaysDimensions(col) - %c = call float @dx.op.worldRayOrigin.f32(i32 147, i8 0) ; WorldRayOrigin(col) - %d = call float @dx.op.worldRayDirection.f32(i32 148, i8 0) ; WorldRayDirection(col) - %e = call float @dx.op.rayTMin.f32(i32 153) ; RayTMin() - %f = call float @dx.op.rayTCurrent.f32(i32 154) ; RayTCurrent() - %g = call i32 @dx.op.rayFlags.i32(i32 144) ; RayFlags() - %h = call i32 
@dx.op.instanceIndex.i32(i32 142) ; InstanceIndex() - %i = call i32 @dx.op.instanceID.i32(i32 141) ; InstanceID() - %j = call i32 @dx.op.primitiveIndex.i32(i32 161) ; PrimitiveIndex() - %k = call float @dx.op.objectRayOrigin.f32(i32 149, i8 0) ; ObjectRayOrigin(col) - %l = call float @dx.op.objectRayDirection.f32(i32 150, i8 0) ; ObjectRayDirection(col) - %m = call float @dx.op.objectToWorld.f32(i32 151, i32 0, i8 0) ; ObjectToWorld(row,col) - %n = call float @dx.op.worldToObject.f32(i32 152, i32 0, i8 0) ; WorldToObject(row,col) - %o = call i32 @dx.op.hitKind.i32(i32 143) ; HitKind() - ret void -} - -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.dispatchRaysDimensions.i32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.dispatchRaysIndex.i32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.objectRayDirection.f32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.objectRayOrigin.f32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.worldRayDirection.f32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.worldRayOrigin.f32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.rayTCurrent.f32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.rayTMin.f32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.hitKind.i32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.primitiveIndex.i32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.instanceID.i32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.instanceIndex.i32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.rayFlags.i32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.worldToObject.f32( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @dx.op.objectToWorld.f32( -; LOWERRAYTRACINGPIPELINE-DECL: declare <3 x i32> @lgc.rt.dispatch.rays.dimensions( -; LOWERRAYTRACINGPIPELINE-DECL: declare <3 x i32> @lgc.rt.dispatch.rays.index( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare <3 x float> @lgc.rt.object.ray.direction( -; 
LOWERRAYTRACINGPIPELINE-DECL-NOT: declare <3 x float> @lgc.rt.object.ray.origin( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare <3 x float> @lgc.rt.world.ray.direction( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare <3 x float> @lgc.rt.world.ray.origin( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare float @lgc.rt.ray.tmin( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @lgc.rt.instance.id( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare float @lgc.rt.ray.tcurrent( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @lgc.rt.hit.kind( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @lgc.rt.primitive.index( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @lgc.rt.instance.index( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare i32 @lgc.rt.ray.flags( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare [4 x <3 x float>] @lgc.rt.object.to.world( -; LOWERRAYTRACINGPIPELINE-DECL-NOT: declare [4 x <3 x float>] @lgc.rt.world.to.object( -; DXILCONTPOSTPROCESS-DECL-NOT: declare <3 x i32> @lgc.rt.dispatch.rays.dimensions( -; DXILCONTPOSTPROCESS-DECL-NOT: declare <3 x i32> @lgc.rt.dispatch.rays.index( -declare i32 @dx.op.dispatchRaysDimensions.i32(i32, i8) #2 -declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #2 -declare float @dx.op.objectRayDirection.f32(i32, i8) #2 -declare float @dx.op.objectRayOrigin.f32(i32, i8) #2 -declare float @dx.op.worldRayDirection.f32(i32, i8) #2 -declare float @dx.op.worldRayOrigin.f32(i32, i8) #2 -declare float @dx.op.rayTCurrent.f32(i32) #1 -declare float @dx.op.rayTMin.f32(i32) #2 -declare i32 @dx.op.hitKind.i32(i32) #2 -declare i32 @dx.op.primitiveIndex.i32(i32) #2 -declare i32 @dx.op.instanceID.i32(i32) #2 -declare i32 @dx.op.instanceIndex.i32(i32) #2 -declare i32 @dx.op.rayFlags.i32(i32) #2 -declare float @dx.op.worldToObject.f32(i32, i32, i8) #2 -declare float @dx.op.objectToWorld.f32(i32, i32, i8) #2 - -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" 
"no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readonly } -attributes #2 = { nounwind readnone } -attributes #4 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{!10} -!dx.entryPoints = !{!18, !29} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{i32 1, void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @ClosestHit, !11} -!11 = !{!12} -!12 = !{i32 1, !13, !13} -!13 = !{} -!18 = !{null, !"", null, !3, !19} -!19 = !{i32 0, i64 65536} -!22 = !{i32 0} -!29 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @ClosestHit, !"ClosestHit", null, null, !30} -!30 = !{i32 8, i32 10, i32 5, !22} -!31 = !{%struct.DispatchSystemData poison} -!32 = !{%struct.SystemData poison} -!33 = !{%struct.AnyHitTraversalData poison} -!34 = !{%struct.DispatchSystemData poison} -!35 = !{i32 0, %struct.AnyHitTraversalData poison} -!36 = !{i32 0, %struct.SystemData poison} -!37 = 
!{%struct.DispatchSystemData poison} -!38 = !{%struct.DispatchSystemData poison} -!39 = !{i32 0, %struct.DispatchSystemData poison} -!40 = !{%struct.DispatchSystemData poison} -!41 = !{%struct.DispatchSystemData poison} -!42 = !{%struct.DispatchSystemData poison} -!43 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!44 = !{i32 0, %struct.HitData poison} -!45 = !{null, %struct.SystemData poison, %struct.HitData poison} -!46 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!47 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!48 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} -!49 = !{i32 0, %struct.RayPayload poison} -!50 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} -!51 = !{%struct.DispatchSystemData poison} -!52 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!53 = !{%struct.AnyHitTraversalData poison} diff --git a/llvmraytracing/test/dx/specialize-driver-shaders/analysis.ll b/llvmraytracing/test/dx/specialize-driver-shaders/analysis.ll deleted file mode 100644 index f372f2da9e..0000000000 --- a/llvmraytracing/test/dx/specialize-driver-shaders/analysis.ll +++ /dev/null @@ -1,469 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: opt --verify-each -passes='specialize-driver-shaders' -S %s -debug-only='specialize-driver-shaders' 2>&1 | FileCheck %s -; -; REQUIRES: assertions - -; Intentionally align i64 to 64 bits so we can test analysis of args that contain padding in memory, -; where the in-register layout in the calling convention does not match the memory layout. 
-target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -; This tests focuses on the preserved-argument analysis for different shader kinds, including await handling. -; For that, we don't care about specific argument details, and thus use an i32 array most of the time. -%args.type = type { [31 x i32] } -; Awaits wrap args into a struct, even if it is just a single one -%awaited.args.type = type { %args.type } -%args.with.padding = type { i32, i64, { i32, i64 } } - -; Ignored prefix args: shaderAddr, levels, state, returnAddr, shaderRecIdx -declare void @lgc.cps.jump(...) -; Ignored prefix args: shaderAddr, levels, shaderRecIdx -; The __ suffix is required to let the dialect visitor detect this as an overload of the await op. -declare %awaited.args.type @lgc.cps.await__(...) -declare %args.with.padding @lgc.cps.await__p(...) -declare { <2 x i16> } @lgc.cps.await__2xi16(...) -declare { i16, i16 } @lgc.cps.await__i16i16(...) -declare { i32 } @lgc.cps.await__i32(...) - -; Legacy await: -declare %awaited.args.type @await(...) -declare %args.type @opaque(...) - -; Simple AHS that just forwards args -; CHECK-LABEL: [SDS] Analyzing function AnyHit1 -define void @AnyHit1(i32, i32, %args.type %args) !lgc.rt.shaderstage !2 { -; CHECK-NEXT: [SDS] Analyzed outgoing call {{.*}} @lgc.cps.jump({{.*}} %args) -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} - call void (...) 
@lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, %args.type %args) - unreachable -; CHECK-NEXT: [SDS] Finished analysis of function AnyHit1 -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} -} - -; Single-jump AHS that: -; * swaps the first two dwords (dynamic) -; * writes constant to dword 10 (constant) -; * writes poison to dword 11 (undef) -; * condiditionally writes constant to dword 20 (constant) -; * condiditionally writes undef to dword 21 (undef) -; * condiditionally writes undef or constant to dword 22 (constant) -; * condiditionally writes constant or dynamic to dword 23 (dynamic) -; * writes same constants to dword 25 (constant) -; * writes different constants to dword 26 (dynamic) -; CHECK-LABEL: [SDS] Analyzing function AnyHit2 -define void @AnyHit2(i32, i32, %args.type %args) !lgc.rt.shaderstage !2 { -entry: - %dw0 = extractvalue %args.type %args, 0, 0 - %dw1 = extractvalue %args.type %args, 0, 1 - %tmp0 = insertvalue %args.type %args, i32 %dw1, 0, 0 - %tmp1 = insertvalue %args.type %tmp0, i32 %dw0, 0, 1 - %tmp2 = insertvalue %args.type %tmp1, i32 -1, 0, 10 - %tmp3 = insertvalue %args.type %tmp2, i32 poison, 0, 11 - %tmp4 = insertvalue %args.type %tmp3, i32 undef, 0, 22 - %dynamic = load i32, ptr null - %tmp5 = insertvalue %args.type %tmp4, i32 %dynamic, 0, 23 - %tmp6 = insertvalue %args.type %tmp5, i32 -1, 0, 25 - %tmp7 = insertvalue %args.type %tmp6, i32 0, 0, 26 - %cond = trunc i32 %dw0 to i1 - br i1 %cond, label %conditional, label %exit -conditional: - %tmp8 = insertvalue %args.type %tmp7, i32 0, 0, 20 - %tmp9 = insertvalue %args.type %tmp8, i32 undef, 0, 21 - %tmp10 = insertvalue %args.type %tmp9, i32 -1, 0, 22 - %tmp11 = insertvalue %args.type %tmp10, i32 -1, 0, 23 - %tmp12 = insertvalue %args.type %tmp11, i32 -1, 0, 25 - %tmp13 = insertvalue %args.type %tmp12, i32 -1, 0, 26 - br label %exit -exit: - %args.final = phi 
%args.type [ %tmp13, %conditional ], [ %tmp7, %entry ] -; CHECK-NEXT: [SDS] Analyzed outgoing call {{.*}} @lgc.cps.jump({{.*}} %args.final) -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] DDPPPPPPPPCUPPPPPPPPCUCDPCDPPPP{{$}} - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, %args.type %args.final) - unreachable -} - -; Two-jump AHS that does different things on the two jumps, testing merging of states -; across jumps works correctly: -; * write constant to dword 0 only on Jump0 -; * write constant to dword 1 only on Jump1 -; * write matching constants to dword 2 -; CHECK-LABEL: [SDS] Analyzing function AnyHit3 -; * write non-matching constants to dword 3 -define void @AnyHit3(i32, i32, %args.type %args) !lgc.rt.shaderstage !2 { -entry: - %dw0 = extractvalue %args.type %args, 0, 0 - %cond = trunc i32 %dw0 to i1 - br i1 %cond, label %exit0, label %exit1 -exit0: - %tmp0 = insertvalue %args.type %args, i32 -1, 0, 0 - %tmp1 = insertvalue %args.type %tmp0, i32 -1, 0, 2 - %tmp2 = insertvalue %args.type %tmp1, i32 -1, 0, 3 - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, %args.type %tmp2) - unreachable -exit1: - %tmp3 = insertvalue %args.type %args, i32 -1, 0, 1 - %tmp4 = insertvalue %args.type %tmp3, i32 -1, 0, 2 - %tmp5 = insertvalue %args.type %tmp4, i32 -2, 0, 3 - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, %args.type %tmp5) - unreachable -; CHECK: [SDS] Finished analysis of function AnyHit3 -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] CCCDPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} -} - -; Intersection with an await call simulating a ReportHit call. -; Check that values passed to await are checked and accounted for in the preserved state, -; and that using values returned from await counts as preserved. 
-; Also check that using original argument values in awaits after awaits still count as preserved. -; Note: This is only possible because we run before coro passes, after coro passes such values -; would be loaded from continuation state and their origin unknown. -; This uses lgc.cps.await. -; CHECK-LABEL: [SDS] Analyzing function Intersection1 -define void @Intersection1(i32, i32, %args.type %args) !lgc.rt.shaderstage !1 { -entry: - %dw0 = extractvalue %args.type %args, 0, 0 - %cond = trunc i32 %dw0 to i1 - br i1 %cond, label %conditional, label %exit -conditional: -; Pass through args, trivially all-preserve -; CHECK-NEXT: [SDS] Analyzed outgoing call %awaited.0.struct {{.*}}lgc.cps.await{{.*}} %args) -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} - %awaited.0.struct = call %awaited.args.type (...) @lgc.cps.await__(i32 poison, i32 poison, i32 poison, %args.type %args) - %awaited.0 = extractvalue %awaited.args.type %awaited.0.struct, 0 -; Pass awaited results. Should still be all-preserve. This tests awaited results are correctly handled. -; CHECK-NEXT: [SDS] Analyzed outgoing call %awaited.1.struct {{.*}}lgc.cps.await{{.*}} %awaited.0) -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} - %awaited.1.struct = call %awaited.args.type (...) @lgc.cps.await__(i32 poison, i32 poison, i32 poison, %args.type %awaited.0) - %awaited.1 = extractvalue %awaited.args.type %awaited.1.struct, 0 - %awaited.merged = insertvalue %args.type %awaited.1, i32 %dw0, 0, 0 -; Reuse incoming dword 0. Should still be preserved. 
-; CHECK-NEXT: [SDS] Analyzed outgoing call %awaited.2.struct {{.*}}lgc.cps.await{{.*}} %awaited.merged) -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} - %awaited.2.struct = call %awaited.args.type (...) @lgc.cps.await__(i32 poison, i32 poison, i32 poison, %args.type %awaited.merged) - %awaited.2 = extractvalue %awaited.args.type %awaited.2.struct, 0 - br label %exit -exit: - %args.final = phi %args.type [ %awaited.2, %conditional ], [ %args, %entry ] -; CHECK-NEXT: [SDS] Analyzed outgoing call {{.*}} @lgc.cps.jump({{.*}} %args.final) -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, %args.type %args.final) - unreachable -} - -; Check that other function calls to non-await functions are not accidentally considered as preserved. -; CHECK-LABEL: [SDS] Analyzing function Intersection3 -define void @Intersection3(i32, i32, %args.type %args) !lgc.rt.shaderstage !1 { - %not.awaited = call %args.type @opaque(%args.type %args) - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, %args.type %not.awaited) - ret void -; CHECK: [SDS] Finished analysis of function Intersection3 -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD{{$}} -} - -; Check that with awaits and phi nodes, we apply the value origin analysis to incoming values of phi nodes, -; and not directly compare against incoming args and await results. -; Check both the dynamic and constant case: Change dw0 dynamically, and dw1 to a constant. -; Then conditionally await, and at the end jump using either the modified %args value or the await result. 
-; The jump argument will be a phi result, and the incoming value -; needs to go through value origin tracking to determine it's in fact -; the incoming function argument, except for the modified dword. -; We use two conditional awaits so also in the constant case (dw1), there are multiple -; dynamic values coming into the phi node. With just a single one, value origin tracking -; can see through the phi node and our phi node handling is not triggered. -; CHECK-LABEL: [SDS] Analyzing function Intersection4 -define void @Intersection4(i32, i32, %args.type %args) !lgc.rt.shaderstage !1 { -entry: - %dw1 = extractvalue %args.type %args, 0, 1 - %args.modified.0 = insertvalue %args.type %args, i32 %dw1, 0, 0 - %args.modified = insertvalue %args.type %args.modified.0, i32 0, 0, 1 - ;%args.modified = insertvalue %args.type %args, i32 1337, 0, 0 - %cond = trunc i32 %dw1 to i1 - switch i32 %dw1, label %exit [ - i32 0, label %conditional.0 - i32 1, label %conditional.1 - ] -conditional.0: -; CHECK-NEXT: [SDS] Analyzed outgoing call %awaited.0.struct {{.*}}lgc.cps.await{{.*}} %args.modified) -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] DCPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} - %awaited.0.struct = call %awaited.args.type (...) @lgc.cps.await__(i32 poison, i32 poison, i32 poison, %args.type %args.modified) - %awaited.0 = extractvalue %awaited.args.type %awaited.0.struct, 0 - br label %exit -conditional.1: -; CHECK-NEXT: [SDS] Analyzed outgoing call %awaited.1.struct {{.*}}lgc.cps.await{{.*}} %args.modified) -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] DCPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} - %awaited.1.struct = call %awaited.args.type (...) 
@lgc.cps.await__(i32 poison, i32 poison, i32 poison, %args.type %args.modified) - %awaited.1 = extractvalue %awaited.args.type %awaited.1.struct, 0 - br label %exit -exit: - %args.final = phi %args.type [ %awaited.0, %conditional.0 ], [ %awaited.1, %conditional.1 ], [ %args.modified, %entry ] -; CHECK: [SDS] Analyzed outgoing call {{.*}} @lgc.cps.jump({{.*}} %args.final) -; CHECK-NEXT: [SDS] 0 1 2 3{{$}} -; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}} -; CHECK-NEXT: [SDS] DCPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, %args.type %args.final) - unreachable -} - -declare [4 x i32] @opaqueCandidate() - -; Traversal shader that contains jumps to an AHS setting a dynamic candidate, and a return back to raygen that preserves only parts of the args. -; CHECK-LABEL: [SDS] Analyzing function Traversal1 (shader stage compute) -define void @Traversal1(i32 %ret.addr, i32, { [2 x i32], [8 x i32] } %system.data, [4 x i32] %padding, [8 x i32] %payload) !lgc.rt.shaderstage !6 { - %cond = trunc i32 %ret.addr to i1 - br i1 %cond, label %rgs.resume, label %ahs -ahs: - %ahs.system.data.0 = insertvalue { { [2 x i32], [8 x i32] }, [4 x i32] } poison, { [2 x i32], [8 x i32] } %system.data, 0 - %candidate = call [4 x i32] @opaqueCandidate() - %ahs.system.data = insertvalue { { [2 x i32], [8 x i32] }, [4 x i32] } %ahs.system.data.0, [4 x i32] %candidate, 1 - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, { { [2 x i32], [8 x i32] }, [4 x i32] } %ahs.system.data, [8 x i32] %payload) - unreachable -rgs.resume: - %dispatch.system.data = extractvalue { [2 x i32], [8 x i32] } %system.data, 0 - call void (...) 
@lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, [2 x i32] %dispatch.system.data, [12 x i32] poison, [8 x i32] %payload) - unreachable -; CHECK-LABEL: [SDS] Finished analysis of function Traversal1 -; CHECK-NEXT: [SDS] 0 1 2 -; CHECK-NEXT: [SDS] 0123456789012345678901 -; CHECK-NEXT: [SDS] PPUUUUUUUUDDDDPPPPPPPP -} - -; Same as above, but without padding args. -; Hypothetical traversal calling an AHS with a larger arg size, and a RGS with smaller arg size. -; This tests mismatching incoming vs outgoing arg sizes. -; CHECK-LABEL: [SDS] Analyzing function Traversal2 (shader stage compute) -define void @Traversal2(i32 %ret.addr, i32, { [2 x i32], [8 x i32] } %system.data, [8 x i32] %payload) !lgc.rt.shaderstage !6 { - %cond = trunc i32 %ret.addr to i1 - br i1 %cond, label %rgs.resume, label %ahs -ahs: - %ahs.system.data.0 = insertvalue { { [2 x i32], [8 x i32] }, [4 x i32] } poison, { [2 x i32], [8 x i32] } %system.data, 0 - %candidate = call [4 x i32] @opaqueCandidate() - %ahs.system.data = insertvalue { { [2 x i32], [8 x i32] }, [4 x i32] } %ahs.system.data.0, [4 x i32] %candidate, 1 -; CHECK-NEXT: [SDS] Analyzed outgoing call call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, { { [2 x i32], [8 x i32] }, [4 x i32] } %ahs.system.data, [8 x i32] %payload) -; CHECK-NEXT: [SDS] 0 1 2 -; CHECK-NEXT: [SDS] 0123456789012345678901 -; CHECK-NEXT: [SDS] PPPPPPPPPPDDDDDDDDDDDD - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, { { [2 x i32], [8 x i32] }, [4 x i32] } %ahs.system.data, [8 x i32] %payload) - unreachable -rgs.resume: - %dispatch.system.data = extractvalue { [2 x i32], [8 x i32] } %system.data, 0 -; CHECK-NEXT: [SDS] Analyzed outgoing call call void (...) 
@lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, [2 x i32] %dispatch.system.data, [8 x i32] %payload) -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 0123456789 -; CHECK-NEXT: [SDS] PPDDDDDDDD - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, [2 x i32] %dispatch.system.data, [8 x i32] %payload) - unreachable -; CHECK-NEXT: [SDS] Finished analysis of function Traversal2 -; CHECK-NEXT: [SDS] 0 1 2 -; CHECK-NEXT: [SDS] 0123456789012345678901 -; CHECK-NEXT: [SDS] PPDDDDDDDDDDDDDDDDDDDD -} - -; %args.with.padding requires 6 registers as argument, but 8 dwords in memory -; Test that we correctly map the argument slots into the in-memory type layout, -; by extracting the individual dword values, and passing them as scalars to an outgoing jump. -; This should be detected as preserve. -; CHECK-LABEL: [SDS] Analyzing function JumpWithPaddingInType -define void @JumpWithPaddingInType(i32 %ret.addr, i32, %args.with.padding %args) !lgc.rt.shaderstage !2 { - %scalar.0 = extractvalue %args.with.padding %args, 0 - %scalar.1 = extractvalue %args.with.padding %args, 1 - %scalar.2 = extractvalue %args.with.padding %args, 2, 0 - %scalar.3 = extractvalue %args.with.padding %args, 2, 1 - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 %scalar.0, i64 %scalar.1, i32 %scalar.2, i64 %scalar.3) - unreachable -; CHECK-LABEL: [SDS] Finished analysis of function JumpWithPaddingInType -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 012345 -; CHECK-NEXT: [SDS] PPPPPP -} - -; Same as above, but for awaits results. 
-; CHECK-LABEL: [SDS] Analyzing function AwaitWithPaddingInType -define void @AwaitWithPaddingInType(i32 %ret.addr, i32, %args.with.padding %args) !lgc.rt.shaderstage !1 { - ; Intentionally do not wrap %args in a struct -- instead pretend the await function returns - ; the elements of %args as separate args, so we can test the mapping of arg slots into the returned struct - ; with multiple struct elements. - %awaited = call %args.with.padding (...) @lgc.cps.await__p(i32 poison, i32 poison, i32 poison, %args.with.padding %args) - %scalar.0 = extractvalue %args.with.padding %awaited, 0 - %scalar.1 = extractvalue %args.with.padding %awaited, 1 - %scalar.2 = extractvalue %args.with.padding %awaited, 2, 0 - %scalar.3 = extractvalue %args.with.padding %awaited, 2, 1 - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 %scalar.0, i64 %scalar.1, i32 %scalar.2, i64 %scalar.3) - unreachable -; CHECK-LABEL: [SDS] Finished analysis of function AwaitWithPaddingInType -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 012345 -; CHECK-NEXT: [SDS] PPPPPP -} - -; Check that we don't treat a single passed-through i16 as preserve. The high outgoing bits are poison, -; so in theory we could treat this as preserve, because only non-poison bits are relevant for the analysis, -; but currently we handle i16s conservatively. Properly supporting i16s is complicated, because incoming poison -; bits that might even be implicit in the in-memory representation of a type need to be accounted for. -; For instance, consider the example that forwards an incoming <2 x i16> argument to a bitcast outgoing i32 argument -; in the JumpWithOverlappingi16s test case. -; CHECK-LABEL: [SDS] Analyzing function JumpWithSinglei16 -define void @JumpWithSinglei16(i32 %ret.addr, i32, i16 %arg) !lgc.rt.shaderstage !2 { -; Forward arg as-is. -; CHECK-NEXT: [SDS] Analyzed outgoing call call void (...) 
@lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i16 %arg) -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] D - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i16 %arg) - unreachable -} - -; Check that we don't treat a misaligned passed-through dword as preserve. Use a packed struct to force misalignment. -; CHECK-LABEL: [SDS] Analyzing function JumpWithMisalignedDword -define void @JumpWithMisalignedDword(i32 %ret.addr, i32, <{ i16, i32 }> %args) !lgc.rt.shaderstage !2 { - switch i32 %ret.addr, label %conditional.0 [ - i32 0, label %conditional.0 - i32 1, label %conditional.1 - ] -conditional.0: -; Forward args as-is. -; CHECK-NEXT: [SDS] Analyzed outgoing call call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, <{ i16, i32 }> %args) -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 01 -; CHECK-NEXT: [SDS] DD - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, <{ i16, i32 }> %args) - unreachable -conditional.1: -; Forward extracted scalars. - %scalar.0 = extractvalue <{ i16, i32 }> %args, 0 - %scalar.1 = extractvalue <{ i16, i32 }> %args, 1 -; CHECK-NEXT: [SDS] Analyzed outgoing call call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i16 %scalar.0, i32 %scalar.1) -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 01 -; CHECK-NEXT: [SDS] DD - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i16 %scalar.0, i32 %scalar.1) - unreachable - unreachable -} - -; All cases involving i16 scalars should not be treated as preserve, as the i16 cannot guarantee to preserve high bits. -; Additionally, there can be issues with alignment. 
-; CHECK-LABEL: [SDS] Analyzing function JumpWithOverlappingi16s -define void @JumpWithOverlappingi16s(i32 %ret.addr, i32, <2 x i16> %args) !lgc.rt.shaderstage !2 { - switch i32 %ret.addr, label %conditional.2 [ - i32 0, label %conditional.0 - i32 1, label %conditional.1 - i32 2, label %conditional.2 - ] -conditional.0: -; CHECK-NEXT: [SDS] Analyzed outgoing call call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, <2 x i16> %args) -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 01 -; CHECK-NEXT: [SDS] DD - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, <2 x i16> %args) - unreachable -conditional.1: -; Forward extracted scalars. This preserves arg slots, but we can't detect it. - %scalar.0 = extractelement <2 x i16> %args, i32 0 - %scalar.1 = extractelement <2 x i16> %args, i32 1 -; CHECK-NEXT: [SDS] Analyzed outgoing call call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i16 %scalar.0, i16 %scalar.1) -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 01 -; CHECK-NEXT: [SDS] DD - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i16 %scalar.0, i16 %scalar.1) - unreachable -conditional.2: -; Forward just the bitcast. This does *not* preserve arg slots, as we merge both i16s into a single i32 arg slot. -; Even when relaxing i16 handling and allowing to treat forwarded i16 arguments as preserve, exploiting that the high bits -; are poison, we may not treat this as preserve. A naive implementation that just compares the value origin of the -; outgoing %bitcast argument with the corresponding incoming argument slot (value %args, offset 0) might come to the conclusion that it is -; preserved. But when allowing i16s, we need to additionally account for the incoming high poison bits that are implicit -; in the in-memory representation of %args. -; CHECK-NEXT: [SDS] Analyzed outgoing call call void (...) 
@lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 %bitcast) -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] D - %bitcast = bitcast <2 x i16> %args to i32 - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 %bitcast) - unreachable -} - -; Same as above, but with awaits. -; CHECK-LABEL: [SDS] Analyzing function AwaitWithOverlappingi16s -define void @AwaitWithOverlappingi16s(i32 %ret.addr, i32, <2 x i16> %args) !lgc.rt.shaderstage !2 { - switch i32 %ret.addr, label %conditional.2 [ - i32 0, label %conditional.0 - i32 1, label %conditional.1 - i32 2, label %conditional.2 - ] -conditional.0: -; Forward args as-is through an await. - %awaited.0.struct = call { <2 x i16> } (...) @lgc.cps.await__2xi16(i32 poison, i32 poison, i32 poison, <2 x i16> %args) - %awaited.0 = extractvalue { <2 x i16> } %awaited.0.struct, 0 -; CHECK-LABEL: [SDS] Analyzed outgoing call call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, <2 x i16> %awaited.0) -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 01 -; CHECK-NEXT: [SDS] DD - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, <2 x i16> %awaited.0) - unreachable -conditional.1: -; Forward extracted scalars through an await. - %scalar.0 = extractelement <2 x i16> %args, i32 0 - %scalar.1 = extractelement <2 x i16> %args, i32 1 - %awaited.1.struct = call { i16, i16 } (...) @lgc.cps.await__i16i16(i32 poison, i32 poison, i32 poison, i16 %scalar.0, i16 %scalar.1) - %awaited.1.0 = extractvalue { i16, i16 } %awaited.1.struct, 0 - %awaited.1.1 = extractvalue { i16, i16 } %awaited.1.struct, 1 -; CHECK: [SDS] Analyzed outgoing call call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i16 %awaited.1.0, i16 %awaited.1.1) -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 01 -; CHECK-NEXT: [SDS] DD - call void (...) 
@lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i16 %awaited.1.0, i16 %awaited.1.1) - unreachable -conditional.2: -; Forward just the bitcast. This does *not* preserve arg slots, as we merge both i16s into a single arg slot. -; CHECK-NEXT: [SDS] Analyzed outgoing call call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 %bitcast) -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] 0 -; CHECK-NEXT: [SDS] D - %bitcast = bitcast <2 x i16> %args to i32 - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 %bitcast) - unreachable -} - -; Check that we ignore callable shaders -define void @Callable(i32 %ret.addr, i32, %args.type %args) !lgc.rt.shaderstage !5 { - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, %args.type %args) - unreachable -; CHECK-NOT: [SDS] Finished analysis of function Callable -} - -; Check that we ignore launch kernel shaders -define void @LaunchKernel(i32 %ret.addr, i32, %args.type %args) !lgc.rt.shaderstage !7 { - call void (...) 
@lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, %args.type %args) - unreachable -; CHECK-NOT: [SDS] Finished analysis of function LaunchKernel -} - -; CHECK: [SDS] Serialized state to MD: -!lgc.cps.module = !{} -!lgc.rt.specialize.driver.shaders.process.in.instruction.order = !{} - -!1 = !{i32 1} ; Intersection -!2 = !{i32 2} ; AHS -!5 = !{i32 5} ; Callable -!6 = !{i32 6} ; Traversal -!7 = !{i32 7} ; KernelEntry diff --git a/llvmraytracing/test/dx/specialize-driver-shaders/lower-rt-pipeline-args.ll b/llvmraytracing/test/dx/specialize-driver-shaders/lower-rt-pipeline-args.ll deleted file mode 100644 index a5c9ac49ee..0000000000 --- a/llvmraytracing/test/dx/specialize-driver-shaders/lower-rt-pipeline-args.ll +++ /dev/null @@ -1,468 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,sroa,specialize-driver-shaders,lint,remove-types-metadata" -S --lint-abort-on-error -debug-only='specialize-driver-shaders' %s 2>&1 | FileCheck %s -; -; Test that argument layouts (number of ignored arguments) expected in specialize-driver-shaders matches what lower-raytracing-pipeline does. -; Intentionally only test non-lgc.cps-mode, as lgc.cps mode requires different arguments in test IR, -; and as it is already tested as part of an LLPC offline pipeline compilation test. 
-; -; REQUIRES: assertions - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.DispatchSystemData = type { <3 x i32> } -%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float, i32 } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.HitData = type { <3 x float>, <3 x float>, float, i32 } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.RayPayload = type { <4 x float> } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.BuiltInTriangleIntersectionAttributes2 = type { <2 x float> } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -define i32 @_cont_GetContinuationStackAddr() #0 { - ret i32 0 -} - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { - ret void -} - -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i32, %struct.TraversalData) #0 - -declare %struct.DispatchSystemData @_AmdAwaitShader(i32, i32, %struct.DispatchSystemData) #0 - -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i32, i32, %struct.AnyHitTraversalData, float, i32) #0 - -define %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData* %data) #0 !pointeetys !32 { - %resPtr = getelementptr %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, i32 0, i32 0 - %res = load %struct.HitData, %struct.HitData* 
%resPtr, align 4 - ret %struct.HitData %res -} - -declare !pointeetys !34 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0 - -declare !pointeetys !36 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 - -define void @_cont_SetTriangleHitAttributes(%struct.SystemData* %data, %struct.BuiltInTriangleIntersectionAttributes %val) !pointeetys !37 { - %addr = getelementptr %struct.SystemData, %struct.SystemData* %data, i32 0, i32 0 - store %struct.BuiltInTriangleIntersectionAttributes %val, %struct.BuiltInTriangleIntersectionAttributes* %addr, align 4 - ret void -} - -define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeetys !38 { - ret i32 5 -} - -declare i1 @opaqueIsEnd() - -define i1 @_cont_IsEndSearch(%struct.TraversalData*) #0 !pointeetys !40 { - %isEnd = call i1 @opaqueIsEnd() - ret i1 %isEnd -} - -declare !pointeetys !42 i32 @_cont_HitKind(%struct.SystemData*) #0 - -; Function Attrs: nounwind -declare i32 @_AmdGetResumePointAddr() #1 - -; Function Attrs: nounwind -declare !pointeetys !43 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 - -; Function Attrs: nounwind -declare !pointeetys !44 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1 - -; Function Attrs: nounwind -declare !pointeetys !43 void @_cont_AcceptHitAndEndSearch(%struct.DispatchSystemData* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !44 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !43 void @_cont_IgnoreHit(%struct.DispatchSystemData* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !44 void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* nocapture readnone) #1 - -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, 
float %13) #0 !pointeetys !45 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %sys_data = insertvalue %struct.SystemData zeroinitializer, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData zeroinitializer, %struct.SystemData %sys_data, 0 - %addr = call i32 @_AmdGetResumePointAddr() #3 - %trav_data2 = insertvalue %struct.TraversalData %trav_data, i32 %addr, 5 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i32 4, %struct.TraversalData %trav_data2) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !pointeetys !46 { - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i32 2, i32 poison, %struct.DispatchSystemData %dis_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 !pointeetys !47 { - %origTPtr = getelementptr inbounds %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, i32 0, i32 0, i32 4 - %origT = load float, float* %origTPtr, align 4 - %isNoHit = fcmp fast uge float %t, %origT - br i1 %isNoHit, label %isEnd, label %callAHit - -callAHit: ; preds = %0 - %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4 - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i32 3, i32 poison, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) - store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 - call void 
@_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* %data) - ret i1 true - -isEnd: ; preds = %0 - ; Call AcceptHitAttributes, just to simulate it - call void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* %data) - ret i1 false -} - -define <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData* %data) !pointeetys !48 { - %resPtr.1 = getelementptr %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0, i32 0 - %res.1 = load i32, i32* %resPtr.1, align 4 - %resPtr.2 = getelementptr %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0, i32 1 - %res.2 = load i32, i32* %resPtr.2, align 4 - %resPtr.3 = getelementptr %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0, i32 2 - %res.3 = load i32, i32* %resPtr.3, align 4 - %val.0 = insertelement <3 x i32> undef, i32 %res.1, i32 0 - %val.1 = insertelement <3 x i32> %val.0, i32 %res.2, i32 1 - %val.2 = insertelement <3 x i32> %val.1, i32 %res.3, i32 2 - ret <3 x i32> %val.2 -} - -define <3 x float> @_cont_ObjectRayOrigin3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) !pointeetys !49 { - %resPtr.1 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 0 - %res.1 = load float, float* %resPtr.1, align 4 - %resPtr.2 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 1 - %res.2 = load float, float* %resPtr.2, align 4 - %resPtr.3 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 2 - %res.3 = load float, float* %resPtr.3, align 4 - %val.0 = insertelement <3 x float> undef, float %res.1, i32 0 - %val.1 = insertelement <3 x float> %val.0, float %res.2, i32 1 - %val.2 = insertelement <3 x float> %val.1, float %res.3, i32 2 - ret <3 x float> %val.2 -} - -define <3 x float> @_cont_ObjectRayDirection3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) !pointeetys !49 { - %resPtr.1 = getelementptr 
%struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 0 - %res.1 = load float, float* %resPtr.1, align 4 - %resPtr.2 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 1 - %res.2 = load float, float* %resPtr.2, align 4 - %resPtr.3 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 2 - %res.3 = load float, float* %resPtr.3, align 4 - %val.0 = insertelement <3 x float> undef, float %res.1, i32 0 - %val.1 = insertelement <3 x float> %val.0, float %res.2, i32 1 - %val.2 = insertelement <3 x float> %val.1, float %res.3, i32 2 - ret <3 x float> %val.2 -} - -define float @_cont_RayTCurrent(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) !pointeetys !51 { - %resPtr = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 2 - %res = load float, float* %resPtr, align 4 - ret float %res -} - -; RayGen: In this test case, we have mostly constant system data (_cont_Traceray uses zero-initialized traversal system data), -; undef padding for the candidate, and constant payload. The storage for committed hit attributes -; within the payload storage is undef as well. -; Note that the dispatch system data (passed in the first args) is dynamic although it preserves an -; argument incoming to RayGen. This is because we only allow arg preservation *within* Traversal. 
-; CHECK-LABEL: [SDS] Finished analysis of function MyRayGen -; CHECK-NEXT: [SDS] 0 1 2 3 4 {{$}} -; CHECK-NEXT: [SDS] 012345678901234567890123456789012345678901234{{$}} -; CHECK-NEXT: [SDS] DDDCCCCCCCCCCCCCCCDUUUUUUUUUUUUUUUUCUUUUUUCCC{{$}} -; ^^^ dynamic dispatch system data -; ^^^^^^^^^^^^^^^ constant ray -; ^^ dynamic raygen.resume return addr -; ^^^^^^^^^^^^^^^^ undef candidate -; ^ ^^^ constant payload -; ^^^^^^ undef committed attrs -define void @MyRayGen() #2 { - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.RayPayload, align 4 - %4 = bitcast %struct.RayPayload* %3 to i8* - call void @llvm.lifetime.start.p0i8(i64 16, i8* %4) #1 - %5 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 - store <4 x float> zeroinitializer, <4 x float>* %5, align 4, !tbaa !52 - %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) - %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }) - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) - %8 = load <4 x float>, <4 x float>* %5, align 4, !tbaa !52 - %9 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) - %10 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) - %11 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2) - %12 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %11, %dx.types.ResourceProperties { i32 4098, i32 1033 }) - %13 = extractelement <4 x float> %8, 
i64 0 - %14 = extractelement <4 x float> %8, i64 1 - %15 = extractelement <4 x float> %8, i64 2 - %16 = extractelement <4 x float> %8, i64 3 - call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %12, i32 %9, i32 %10, i32 undef, float %13, float %14, float %15, float %16, i8 15) - call void @llvm.lifetime.end.p0i8(i64 16, i8* %4) #1 - ret void -} - -; Non-recursive CHS: No calls to Traversal, so no state to report. -; CHECK-LABEL: [SDS] Finished analysis of function MyClosestHitShader -; CHECK-NEXT: [SDS] -define void @MyClosestHitShader(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #2 !pointeetys !55 { - %1 = getelementptr inbounds %struct.BuiltInTriangleIntersectionAttributes, %struct.BuiltInTriangleIntersectionAttributes* %attr, i32 0, i32 0 - %2 = load <2 x float>, <2 x float>* %1, align 4 - %3 = extractelement <2 x float> %2, i32 0 - %4 = fsub fast float 1.000000e+00, %3 - %5 = extractelement <2 x float> %2, i32 1 - %6 = fsub fast float %4, %5 - %7 = insertelement <4 x float> undef, float %6, i64 0 - %8 = insertelement <4 x float> %7, float %3, i64 1 - %9 = insertelement <4 x float> %8, float %5, i64 2 - %10 = insertelement <4 x float> %9, float 1.000000e+00, i64 3 - %11 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - store <4 x float> %10, <4 x float>* %11, align 4 - ret void -} - -; AnyHit: Payload and committed hit attrs are preserved. 
-; CHECK-LABEL: [SDS] Finished analysis of function MyAnyHitShader -; CHECK-NEXT: [SDS] 0 1 2 3 4 {{$}} -; CHECK-NEXT: [SDS] 012345678901234567890123456789012345678901234{{$}} -; CHECK-NEXT: [SDS] DDDDDDDDDDDDDDDDDDDDDDDDDDDUUUUUUUUPPPPPPPPPP{{$}} -define void @MyAnyHitShader(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readnone %attr) #2 !pointeetys !55 { - %1 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - %2 = load <4 x float>, <4 x float>* %1, align 4 - %3 = call float @dx.op.objectRayOrigin.f32(i32 149, i8 0) - %4 = call float @dx.op.objectRayDirection.f32(i32 150, i8 0) - %5 = call float @dx.op.rayTCurrent.f32(i32 154) - %6 = fmul fast float %5, %4 - %7 = fadd fast float %6, %3 - %8 = fcmp fast ogt float %7, 0.000000e+00 - %9 = fcmp fast ogt float %7, 1.000000e+00 - %10 = fcmp fast ogt float %7, -1.000000e+00 - br i1 %8, label %11, label %14 - -11: ; preds = %0 -; acceptHitAndEndSearch - store <4 x float> %2, <4 x float>* %1, align 4 - br i1 %9, label %12, label %13 - -12: ; preds = %11 -; acceptHitAndEndSearch with unreachable - call void @dx.op.acceptHitAndEndSearch(i32 156) - unreachable - -13: ; preds = %11 -; acceptHitAndEndSearch with ret void - call void @dx.op.acceptHitAndEndSearch(i32 156) - ret void - -14: ; preds = %0 -; IgnoreHit or AcceptHit - br i1 %10, label %15, label %18 - -15: ; preds = %14 -; IgnoreHit - br i1 %9, label %16, label %17 - -16: ; preds = %15 -; IgnoreHit with unreachable - call void @dx.op.ignoreHit(i32 155) - unreachable - -17: ; preds = %15 -; IgnoreHit with ret void (as emitted by debug mode dxc) - call void @dx.op.ignoreHit(i32 155) - ret void - -18: ; preds = %14 -; AcceptHit - store <4 x float> %2, <4 x float>* %1, align 4 - ret void -} - -; Intersection: The payload is preserved, even across ReportHit calls. -; Six Argument slots unused by the small hit attributes are undef. 
-; CHECK-LABEL: [SDS] Finished analysis of function MyIntersectionShader -; CHECK-NEXT: [SDS] 0 1 2 3 4 5 6 {{$}} -; CHECK-NEXT: [SDS] 01234567890123456789012345678901234567890123456789012345678901234{{$}} -; CHECK-NEXT: [SDS] DDDPPPPPPPPPPPPPPPPPPPPPPPPDCUUUUUUPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}} -define void @MyIntersectionShader() #2 { - %1 = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4 - %2 = call float @dx.op.rayTCurrent.f32(i32 154) - %3 = bitcast %struct.BuiltInTriangleIntersectionAttributes* %1 to i8* - call void @llvm.lifetime.start.p0i8(i64 8, i8* %3) #1 - %4 = call i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32 158, float %2, i32 0, %struct.BuiltInTriangleIntersectionAttributes* nonnull %1) - call void @llvm.lifetime.end.p0i8(i64 8, i8* %3) #1 - ret void -} - -; Intersection with ReportHit in a loop: The analysis doesn't understand that the payload is preserved, -; because we don't repeatedly propagate through loops. This could be improved in ValueOriginTracking. -; CHECK-LABEL: [SDS] Finished analysis of function MyIntersectionShaderLoop -; CHECK-NEXT: [SDS] 0 1 2 3 4 5 6 {{$}} -; CHECK-NEXT: [SDS] 01234567890123456789012345678901234567890123456789012345678901234{{$}} -; CHECK-NEXT: [SDS] DDDDDDDDDDDDDDDDDDDDDDDDDDDDCUUUUUUDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD{{$}} -define void @MyIntersectionShaderLoop() #2 { - %1 = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4 - %2 = call float @dx.op.rayTCurrent.f32(i32 154) - %3 = bitcast %struct.BuiltInTriangleIntersectionAttributes* %1 to i8* - br label %loop -loop: - %4 = call i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32 158, float %2, i32 0, %struct.BuiltInTriangleIntersectionAttributes* nonnull %1) - br i1 %4, label %loop, label %exit -exit: - ret void -} - -; Non-recursive Miss: No calls to Traversal, so no state to report. 
-; CHECK-LABEL: [SDS] Finished analysis of function MyMissShader -; CHECK-NEXT: [SDS] -define void @MyMissShader(%struct.RayPayload* noalias nocapture %payload) #2 !pointeetys !58 { - %1 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - store <4 x float> , <4 x float>* %1, align 4 - ret void -} - -; Recursive Miss: The passes through the incoming payload to traceRay, but it's treated as dynamic because miss is outside of Traversal. -; CHECK-LABEL: [SDS] Finished analysis of function MyMissShaderRecursive -; CHECK-NEXT: [SDS] 0 1 2 3 4 {{$}} -; CHECK-NEXT: [SDS] 012345678901234567890123456789012345678901234{{$}} -; CHECK-NEXT: [SDS] DDDCCCCCCCCCCCCCCCDUUUUUUUUUUUUUUUUDDDDDDDDDD{{$}} -define void @MyMissShaderRecursive(%struct.RayPayload* noalias nocapture %payload) #2 !pointeetys !58 { - %tmp1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %tmp6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %tmp1) - %tmp7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %tmp6, %dx.types.ResourceProperties { i32 16, i32 0 }) - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %tmp7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %payload) - ret void -} - -; Function Attrs: nounwind -declare !pointeetys !59 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #1 - -; Function Attrs: nounwind -declare void @dx.op.textureStore.f32(i32, %dx.types.Handle, i32, i32, i32, float, float, float, float, i8) #1 - -; Function Attrs: nounwind memory(none) -declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #3 - -; Function 
Attrs: nounwind memory(none) -declare float @dx.op.objectRayDirection.f32(i32, i8) #3 - -; Function Attrs: nounwind memory(none) -declare float @dx.op.objectRayOrigin.f32(i32, i8) #3 - -; Function Attrs: nounwind memory(read) -declare float @dx.op.rayTCurrent.f32(i32) #4 - -declare void @dx.op.acceptHitAndEndSearch(i32) #0 - -declare void @dx.op.ignoreHit(i32) #0 - -; Function Attrs: nounwind -declare !pointeetys !60 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) #1 - -; Function Attrs: nounwind -declare !pointeetys !61 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes2(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes2*) #1 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #3 - -; Function Attrs: nounwind memory(read) -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !63 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !63 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5 - -attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind memory(none) } 
-attributes #4 = { nounwind memory(read) } -attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{!10} -!dx.entryPoints = !{!18, !20, !23, !25, !27, !29, !31, !65} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{i32 1, void ()* @MyRayGen, !11, void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyClosestHitShader, !14, void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyAnyHitShader, !14, void ()* @MyIntersectionShader, !11, void ()* @MyIntersectionShaderLoop, !11, void (%struct.RayPayload*)* @MyMissShader, !17} -!11 = !{!12} -!12 = !{i32 1, !13, !13} -!13 = !{} -!14 = !{!12, !15, !16} -!15 = !{i32 2, !13, !13} -!16 = !{i32 0, !13, !13} -!17 = !{!12, !15} -!18 = !{null, !"", null, !3, !19} -!19 = !{i32 0, i64 65536} -!20 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyAnyHitShader, !"MyAnyHitShader", null, null, !21} -!21 = !{i32 8, i32 9, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!22 = !{i32 0} -!23 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyClosestHitShader, !"MyClosestHitShader", null, null, !24} -!24 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, 
i32 5, !22} -!25 = !{void ()* @MyIntersectionShader, !"MyIntersectionShader", null, null, !26} -!26 = !{i32 8, i32 8, i32 5, !22} -!27 = !{void (%struct.RayPayload*)* @MyMissShader, !"MyMissShader", null, null, !28} -!28 = !{i32 8, i32 11, i32 6, i32 16, i32 5, !22} -!29 = !{void ()* @MyRayGen, !"MyRayGen", null, null, !30} -!30 = !{i32 8, i32 7, i32 5, !22} -!31 = !{void ()* @MyIntersectionShaderLoop, !"MyIntersectionShaderLoop", null, null, !26} -!32 = !{%struct.AnyHitTraversalData poison} -!33 = !{i32 0, %struct.AnyHitTraversalData poison} -!34 = !{%struct.SystemData poison} -!35 = !{i32 0, %struct.SystemData poison} -!36 = !{%struct.SystemData poison} -!37 = !{%struct.SystemData poison} -!38 = !{%struct.DispatchSystemData poison} -!39 = !{i32 0, %struct.DispatchSystemData poison} -!40 = !{%struct.TraversalData poison} -!41 = !{i32 0, %struct.TraversalData poison} -!42 = !{%struct.SystemData poison} -!43 = !{%struct.DispatchSystemData poison} -!44 = !{%struct.AnyHitTraversalData poison} -!45 = !{%struct.DispatchSystemData poison} -!46 = !{%struct.DispatchSystemData poison} -!47 = !{%struct.AnyHitTraversalData poison} -!48 = !{%struct.DispatchSystemData poison} -!49 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!50 = !{i32 0, %struct.HitData poison} -!51 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison} -!52 = !{!53, !53, i64 0} -!53 = !{!"omnipotent char", !54, i64 0} -!54 = !{!"Simple C/C++ TBAA"} -!55 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} -!56 = !{i32 0, %struct.RayPayload poison} -!57 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} -!58 = !{%struct.RayPayload poison} -!59 = !{%struct.RayPayload poison} -!60 = !{%struct.BuiltInTriangleIntersectionAttributes poison} -!61 = !{%struct.BuiltInTriangleIntersectionAttributes2 poison} -!62 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes2 poison} -!63 = !{i8 poison} -!64 = !{i32 0, i8 poison} -!65 
= !{void (%struct.RayPayload*)* @MyMissShaderRecursive, !"MyMissShaderRecursive", null, null, !28} diff --git a/llvmraytracing/test/dx/specialize-driver-shaders/specialization.ll b/llvmraytracing/test/dx/specialize-driver-shaders/specialization.ll deleted file mode 100644 index 7149986ae8..0000000000 --- a/llvmraytracing/test/dx/specialize-driver-shaders/specialization.ll +++ /dev/null @@ -1,104 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; -; Traversal specialization tests. The Traversal functions in this module always pass through args, -; and the module contains metadata with argument slot infos. -; Value specialization has its own lit tests, so we focus here -; on everything that is implemented in SpecializeDriverShadersPass, particularly regarding the argument slot handling. -; -; RUN: opt --verify-each -passes='specialize-driver-shaders' -S %s | FileCheck %s -; -; Intentionally align i64 to 64 bits so we can test specializations within types with padding. -; i16 is aligned to 16 bits so we can test smaller-than-dword scalars. -; f32 is aligned to 16 bits to test misaligned dword-sized scalars. -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:16-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -; Ignored prefix args: shaderAddr, levels, state, returnAddr, shaderRecIdx -declare void @lgc.cps.jump(...) 
- -define void @SimpleArray(i32 %ret.addr, i32, [4 x i32] %args) !lgc.rt.shaderstage !{i32 6} { -; CHECK-LABEL: define void @SimpleArray( -; CHECK-SAME: i32 [[RET_ADDR:%.*]], i32 [[TMP0:%.*]], [4 x i32] [[ARGS:%.*]]) !lgc.rt.shaderstage [[META2:![0-9]+]] { -; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue [4 x i32] [[ARGS]], i32 42, 1 -; CHECK-NEXT: [[TMP3:%.*]] = freeze i32 poison -; CHECK-NEXT: [[ARGS_SPECIALIZED1:%.*]] = insertvalue [4 x i32] [[ARGS_SPECIALIZED]], i32 [[TMP3]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = freeze i32 poison -; CHECK-NEXT: [[ARGS_SPECIALIZED2:%.*]] = insertvalue [4 x i32] [[ARGS_SPECIALIZED1]], i32 [[TMP4]], 3 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, [4 x i32] [[ARGS_SPECIALIZED2]]) -; CHECK-NEXT: unreachable -; - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, [4 x i32] %args) - unreachable -} - -define void @SimpleScalars(i32 %ret.addr, i32, i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3) !lgc.rt.shaderstage !{i32 6} { -; CHECK-LABEL: define void @SimpleScalars( -; CHECK-SAME: i32 [[RET_ADDR:%.*]], i32 [[TMP0:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]]) !lgc.rt.shaderstage [[META2]] { -; CHECK-NEXT: [[TMP3:%.*]] = freeze i32 poison -; CHECK-NEXT: [[TMP4:%.*]] = freeze i32 poison -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 [[ARG0]], i32 42, i32 [[TMP3]], i32 [[TMP4]]) -; CHECK-NEXT: unreachable -; - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3) - unreachable -} - -define void @I16s(i32 %ret.addr, i32, i16 %arg0, i16 %arg1, i16 %arg2, i16 %arg3) !lgc.rt.shaderstage !{i32 6} { -; CHECK-LABEL: define void @I16s( -; CHECK-SAME: i32 [[RET_ADDR:%.*]], i32 [[TMP0:%.*]], i16 [[ARG0:%.*]], i16 [[ARG1:%.*]], i16 [[ARG2:%.*]], i16 [[ARG3:%.*]]) !lgc.rt.shaderstage [[META2]] { -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i16 [[ARG0]], i16 [[ARG1]], i16 [[ARG2]], i16 [[ARG3]]) -; CHECK-NEXT: unreachable -; - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i16 %arg0, i16 %arg1, i16 %arg2, i16 %arg3) - unreachable -} - -; Test that even if specialization of i16 arguments is ignored, we still specialize i32s. -define void @MixedI16I32s(i32 %ret.addr, i32, i16 %arg0, i32 %arg1, i16 %arg2, i32 %arg3) !lgc.rt.shaderstage !{i32 6} { -; CHECK-LABEL: define void @MixedI16I32s( -; CHECK-SAME: i32 [[RET_ADDR:%.*]], i32 [[TMP0:%.*]], i16 [[ARG0:%.*]], i32 [[ARG1:%.*]], i16 [[ARG2:%.*]], i32 [[ARG3:%.*]]) !lgc.rt.shaderstage [[META2]] { -; CHECK-NEXT: [[TMP3:%.*]] = freeze i32 poison -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i16 [[ARG0]], i32 42, i16 [[ARG2]], i32 [[TMP3]]) -; CHECK-NEXT: unreachable -; - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, i16 %arg0, i32 %arg1, i16 %arg2, i32 %arg3) - unreachable -} - -; Test that specializing an arg slot that occupies a full misaligned dword in the argument isn't supported -; In this test, the first contained float scalar is specialized, because it is dword-aligned, -; but the second isn't, because it is not aligned. This is because i16 and float use 16-bit alignment in this test. -define void @MisalignedDwords(i32 %ret.addr, i32, { i32, float, i16, float, i32 } %args) !lgc.rt.shaderstage !{i32 6} { -; CHECK-LABEL: define void @MisalignedDwords( -; CHECK-SAME: i32 [[RET_ADDR:%.*]], i32 [[TMP0:%.*]], { i32, float, i16, float, i32 } [[ARGS:%.*]]) !lgc.rt.shaderstage [[META2]] { -; CHECK-NEXT: [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, float, i16, float, i32 } [[ARGS]], float 0x36F5000000000000, 1 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, { i32, float, i16, float, i32 } [[ARGS_SPECIALIZED]]) -; CHECK-NEXT: unreachable -; - call void (...) @lgc.cps.jump(i32 poison, i32 poison, i32 poison, i32 poison, { i32, float, i16, float, i32 } %args) - unreachable -} - -!lgc.cps.module = !{} -!lgc.rt.specialize.driver.shaders.state = !{!0} -; Disable analysis, so traversal variants that we can't handle don't affect other functions in this test. -!lgc.rt.specialize.driver.shaders.opts = !{!1} - -; Numerical status values: -; -; Status | Value -; ===================== -; Dynamic | 0 -; Constant | 1 -; UndefOrPoison | 2 -; Preserve | 3 -; - -!0 = !{ -; Status | [Constant] | Arg slot idx - i32 0, i32 0, ; 0 - i32 1, i32 42, ; 1 - i32 2, i32 0, ; 2 - i32 3, i32 0, ; 3 - i32 0, i32 0 ; 4 -} -!1 = !{i32 0, i32 1} diff --git a/llvmraytracing/test/dx/stats-report-sizes.ll b/llvmraytracing/test/dx/stats-report-sizes.ll deleted file mode 100644 index a534787078..0000000000 --- a/llvmraytracing/test/dx/stats-report-sizes.ll +++ /dev/null @@ -1,58 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: opt --report-cont-state-sizes --verify-each -passes='continuations-stats-report,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-CONT-SIZES -; RUN: opt --report-payload-register-sizes=max --verify-each -passes='continuations-stats-report,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-PAYLOAD-SIZES -; RUN: opt --report-system-data-sizes --verify-each -passes='continuations-stats-report,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-SYSTEM-DATA-SIZES - -target datalayout = 
"e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.DispatchSystemData = type { i32 } -%struct.CHSSystemData = type { [100 x i32] } - -declare i32 @continuation.initialContinuationStackPtr() -declare i32 @_cont_GetContinuationStackAddr() -declare i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) -declare void @lgc.cps.jump(...) - -; REPORT-CONT-SIZES: Continuation state size of "RayGen" (raygeneration): 108 bytes -; REPORT-PAYLOAD-SIZES: Incoming and max outgoing payload VGPR size of "RayGen" (raygeneration): 7 and 6 dwords -define void @RayGen(i32 %cspInit, i32 %dummyRetAddr, %struct.DispatchSystemData %0) !continuation.entry !0 !continuation !3 !continuation.state !5 !continuation.registercount !7 !lgc.rt.shaderstage !12 { - %ptr = alloca i32, align 4 - %cspInit1 = call i32 @continuation.initialContinuationStackPtr() - store i32 %cspInit1, i32* %ptr - %csp = load i32, ptr %ptr, align 4 - call void (...) @lgc.cps.jump(i32 2, i32 poison, i32 %csp, i32 poison), !continuation.registercount !6 - ret void -} - -; This is needed as fake continuation of RayGen, because we only report continuation state sizes -; if we find a continuation function using !continuation metadata. 
-; REPORT-SYSTEM-DATA-SIZES-DAG: Incoming system data of "RayGen.resume.0" (raygeneration) is "struct.DispatchSystemData", size: 4 bytes -define void @RayGen.resume.0(i32 %cspInit, i32 %0, %struct.DispatchSystemData %1) !continuation !3 !lgc.rt.shaderstage !12 { - ret void -} - -; REPORT-PAYLOAD-SIZES: Incoming and max outgoing payload VGPR size of "CHS" (closesthit): 8 and 9 dwords -; REPORT-SYSTEM-DATA-SIZES-DAG: Incoming system data of "CHS" (closesthit) is "struct.CHSSystemData", size: 400 bytes -define void @CHS(i32 %cspInit, i32 %returnAddr, %struct.CHSSystemData %0) !continuation !14 !continuation.registercount !8 !lgc.rt.shaderstage !13 { - call void ( ...) @lgc.cps.jump(i32 2, i32 poison, i32 poison, i32 poison), !continuation.registercount !9 - ret void -} - -!dx.entryPoints = !{!1, !10} -!continuation.stackAddrspace = !{!4} - -!0 = !{} -!1 = !{void ()* @RayGen, !"RayGen", null, null, !2} -!2 = !{i32 8, i32 7} -!3 = !{void ()* @RayGen} -!4 = !{i32 21} -!5 = !{i32 108} -!6 = !{i32 6} -!7 = !{i32 7} -!8 = !{i32 8} -!9 = !{i32 9} -!10 = !{void ()* @CHS, !"CHS", null, null, !11} -!11 = !{i32 8, i32 10} -!12 = !{i32 0} -!13 = !{i32 3} -!14 = !{ptr @CHS} diff --git a/llvmraytracing/test/dx/unnamed-type-intrinsics.ll b/llvmraytracing/test/dx/unnamed-type-intrinsics.ll deleted file mode 100644 index 60b90d3114..0000000000 --- a/llvmraytracing/test/dx/unnamed-type-intrinsics.ll +++ /dev/null @@ -1,494 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s - -; Check that using unnamed types works well with generating intrinsic names - -target datalayout = 
"e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -; struct.DispatchSystemData -%0 = type { <3 x i32> } -; struct.TraversalData -%1 = type { %2, %struct.HitData, <3 x float>, <3 x float>, float, i32 } -; struct.SystemData -%2 = type { %0 } -; struct.AnyHitTraversalData -%3 = type { %1, %struct.HitData } -%dx.types.Handle = type { i8* } -%struct.HitData = type { <3 x float>, <3 x float>, float, i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.RayPayload = type { <4 x float> } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -declare i32 @_cont_GetContinuationStackAddr() #0 - -declare %0 @_AmdAwaitTraversal(i32, %1) #0 - -declare %0 @_AmdAwaitShader(i32, %0) #0 - -declare %3 @_AmdAwaitAnyHit(i32, %3, float, i32) #0 - -declare !pointeetys !17 %struct.HitData @_cont_GetCandidateState(%3*) #0 - -declare !pointeetys !19 %struct.HitData @_cont_GetCommittedState(%2*) #0 - -declare !pointeetys !21 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%2*) #0 - -declare !pointeetys !22 void @_cont_SetTriangleHitAttributes(%2*, %struct.BuiltInTriangleIntersectionAttributes) #0 - -declare !pointeetys !23 i32 @_cont_GetLocalRootIndex(%0*) - -declare !pointeetys !25 i1 @_cont_IsEndSearch(%1*) #0 - -declare !pointeetys !27 i32 @_cont_HitKind(%2*) #0 - -; Function Attrs: nounwind -declare i32 @_AmdGetResumePointAddr() #1 - -; Function Attrs: nounwind -declare !pointeetys !28 void @_AmdRestoreSystemData(%0*) #1 - -; Function Attrs: 
nounwind -declare !pointeetys !29 void @_AmdRestoreSystemDataAnyHit(%3*) #1 - -; Function Attrs: nounwind -declare !pointeetys !28 void @_cont_AcceptHitAndEndSearch(%0* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !29 void @_cont_AcceptHit(%3* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !28 void @_cont_IgnoreHit(%0* nocapture readnone) #1 - -; Function Attrs: nounwind -declare !pointeetys !29 void @_AmdAcceptHitAttributes(%3* nocapture readnone) #1 - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%0 poison} { - ret void -} - -define void @_cont_TraceRay(%0* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !30 { - %dis_data = load %0, %0* %data, align 4 - %sys_data = insertvalue %2 undef, %0 %dis_data, 0 - %trav_data = insertvalue %1 undef, %2 %sys_data, 0 - %addr = call i32 @_AmdGetResumePointAddr() #3 - %trav_data2 = insertvalue %1 %trav_data, i32 %addr, 5 - %newdata = call %0 @_AmdAwaitTraversal(i32 4, %1 %trav_data2) - store %0 %newdata, %0* %data, align 4 - call void @_AmdRestoreSystemData(%0* %data) - ret void -} - -define void @_cont_CallShader(%0* %data, i32 %0) #0 !pointeetys !31 { - %dis_data = load %0, %0* %data, align 4 - %newdata = call %0 @_AmdAwaitShader(i32 2, %0 %dis_data) - store %0 %newdata, %0* %data, align 4 - call void @_AmdRestoreSystemData(%0* %data) - ret void -} - -define i1 @_cont_ReportHit(%3* %data, float %t, i32 %hitKind) #0 !pointeetys !32 { - %origTPtr = getelementptr inbounds %3, %3* %data, i32 0, i32 0, i32 4 - %origT = load float, float* %origTPtr, align 4 - %isNoHit = fcmp fast uge float %t, %origT - br i1 %isNoHit, label %isEnd, label %callAHit - -callAHit: ; preds = %0 - %trav_data = load %3, %3* %data, align 4 - %newdata = call %3 @_AmdAwaitAnyHit(i32 3, %3 %trav_data, float %t, i32 %hitKind) - store %3 %newdata, %3* 
%data, align 4 - call void @_AmdRestoreSystemDataAnyHit(%3* %data) - ret i1 true - -isEnd: ; preds = %0 - ; Call AcceptHitAttributes, just to simulate it - call void @_AmdAcceptHitAttributes(%3* %data) - ret i1 false -} - -define <3 x i32> @_cont_DispatchRaysIndex3(%0* %data) !pointeetys !33 { - %resPtr.1 = getelementptr %0, %0* %data, i32 0, i32 0, i32 0 - %res.1 = load i32, i32* %resPtr.1, align 4 - %resPtr.2 = getelementptr %0, %0* %data, i32 0, i32 0, i32 1 - %res.2 = load i32, i32* %resPtr.2, align 4 - %resPtr.3 = getelementptr %0, %0* %data, i32 0, i32 0, i32 2 - %res.3 = load i32, i32* %resPtr.3, align 4 - %val.0 = insertelement <3 x i32> undef, i32 %res.1, i32 0 - %val.1 = insertelement <3 x i32> %val.0, i32 %res.2, i32 1 - %val.2 = insertelement <3 x i32> %val.1, i32 %res.3, i32 2 - ret <3 x i32> %val.2 -} - -define <3 x float> @_cont_ObjectRayOrigin3(%0* nocapture readnone %data, %struct.HitData* %hitData) !pointeetys !34 { - %resPtr.1 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 0 - %res.1 = load float, float* %resPtr.1, align 4 - %resPtr.2 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 1 - %res.2 = load float, float* %resPtr.2, align 4 - %resPtr.3 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 2 - %res.3 = load float, float* %resPtr.3, align 4 - %val.0 = insertelement <3 x float> undef, float %res.1, i32 0 - %val.1 = insertelement <3 x float> %val.0, float %res.2, i32 1 - %val.2 = insertelement <3 x float> %val.1, float %res.3, i32 2 - ret <3 x float> %val.2 -} - -define <3 x float> @_cont_ObjectRayDirection3(%0* nocapture readnone %data, %struct.HitData* %hitData) !pointeetys !34 { - %resPtr.1 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 0 - %res.1 = load float, float* %resPtr.1, align 4 - %resPtr.2 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 1 - %res.2 = load float, float* %resPtr.2, align 4 
- %resPtr.3 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 2 - %res.3 = load float, float* %resPtr.3, align 4 - %val.0 = insertelement <3 x float> undef, float %res.1, i32 0 - %val.1 = insertelement <3 x float> %val.0, float %res.2, i32 1 - %val.2 = insertelement <3 x float> %val.1, float %res.3, i32 2 - ret <3 x float> %val.2 -} - -define float @_cont_RayTCurrent(%0* nocapture readnone %data, %struct.HitData* %hitData) !pointeetys !36 { - %resPtr = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 2 - %res = load float, float* %resPtr, align 4 - ret float %res -} - -; Function Attrs: nounwind -define void @MyRayGen() #2 { - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.RayPayload, align 4 - %4 = bitcast %struct.RayPayload* %3 to i8* - call void @llvm.lifetime.start.p0i8(i64 16, i8* %4) #1 - %5 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 - store <4 x float> zeroinitializer, <4 x float>* %5, align 4, !tbaa !37 - %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) - %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }) - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) - %8 = load <4 x float>, <4 x float>* %5, align 4, !tbaa !37 - %9 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) - %10 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) - %11 = call %dx.types.Handle 
@dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2) - %12 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %11, %dx.types.ResourceProperties { i32 4098, i32 1033 }) - %13 = extractelement <4 x float> %8, i64 0 - %14 = extractelement <4 x float> %8, i64 1 - %15 = extractelement <4 x float> %8, i64 2 - %16 = extractelement <4 x float> %8, i64 3 - call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %12, i32 %9, i32 %10, i32 undef, float %13, float %14, float %15, float %16, i8 15) - call void @llvm.lifetime.end.p0i8(i64 16, i8* %4) #1 - ret void -} - -; Function Attrs: nounwind -define void @MyClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #2 !pointeetys !40 { - %1 = getelementptr inbounds %struct.BuiltInTriangleIntersectionAttributes, %struct.BuiltInTriangleIntersectionAttributes* %attr, i32 0, i32 0 - %2 = load <2 x float>, <2 x float>* %1, align 4 - %3 = extractelement <2 x float> %2, i32 0 - %4 = fsub fast float 1.000000e+00, %3 - %5 = extractelement <2 x float> %2, i32 1 - %6 = fsub fast float %4, %5 - %7 = insertelement <4 x float> undef, float %6, i64 0 - %8 = insertelement <4 x float> %7, float %3, i64 1 - %9 = insertelement <4 x float> %8, float %5, i64 2 - %10 = insertelement <4 x float> %9, float 1.000000e+00, i64 3 - %11 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - store <4 x float> %10, <4 x float>* %11, align 4 - ret void -} - -; Function Attrs: nounwind -declare !pointeetys !43 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #1 - -; Function Attrs: nounwind -declare void @dx.op.textureStore.f32(i32, %dx.types.Handle, i32, i32, i32, float, float, float, float, i8) #1 - -; Function Attrs: nounwind memory(none) -declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #3 - 
-; Function Attrs: nounwind memory(none) -declare float @dx.op.objectRayDirection.f32(i32, i8) #3 - -; Function Attrs: nounwind memory(none) -declare float @dx.op.objectRayOrigin.f32(i32, i8) #3 - -; Function Attrs: nounwind memory(read) -declare float @dx.op.rayTCurrent.f32(i32) #4 - -declare void @dx.op.acceptHitAndEndSearch(i32) #0 - -declare void @dx.op.ignoreHit(i32) #0 - -; Function Attrs: nounwind -declare !pointeetys !44 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) #1 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #3 - -; Function Attrs: nounwind memory(read) -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !45 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !45 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5 - -attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind memory(none) } -attributes #4 = { nounwind memory(read) } -attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!llvm.ident = !{!0} -!dx.version = !{!1} 
-!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{} -!dx.entryPoints = !{!10, !12, !15} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{null, !"", null, !3, !11} -!11 = !{i32 0, i64 65536} -!12 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyClosestHit, !"MyClosestHit", null, null, !13} -!13 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, !14} -!14 = !{i32 0} -!15 = !{void ()* @MyRayGen, !"MyRayGen", null, null, !16} -!16 = !{i32 8, i32 7, i32 5, !14} -!17 = !{%3 poison} -!18 = !{i32 0, %3 poison} -!19 = !{%2 poison} -!20 = !{i32 0, %2 poison} -!21 = !{%2 poison} -!22 = !{%2 poison} -!23 = !{%0 poison} -!24 = !{i32 0, %0 poison} -!25 = !{%1 poison} -!26 = !{i32 0, %1 poison} -!27 = !{%2 poison} -!28 = !{%0 poison} -!29 = !{%3 poison} -!30 = !{%0 poison} -!31 = !{%0 poison} -!32 = !{%3 poison} -!33 = !{%0 poison} -!34 = !{null, %0 poison, %struct.HitData poison} -!35 = !{i32 0, %struct.HitData poison} -!36 = !{null, %0 poison, %struct.HitData poison} -!37 = !{!38, !38, i64 0} -!38 = !{!"omnipotent char", !39, i64 0} -!39 = !{!"Simple C/C++ TBAA"} -!40 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} -!41 = !{i32 0, %struct.RayPayload poison} -!42 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison} -!43 = 
!{%struct.RayPayload poison} -!44 = !{%struct.BuiltInTriangleIntersectionAttributes poison} -!45 = !{i8 poison} -; LOWERRAYTRACINGPIPELINE-LABEL: define <3 x i32> @_cont_DispatchRaysIndex3( -; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) { -; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_1:%.*]] = getelementptr [[TMP0:%.*]], ptr [[DATA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_1:%.*]] = load i32, ptr [[RESPTR_1]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_2:%.*]] = getelementptr [[TMP0]], ptr [[DATA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_2:%.*]] = load i32, ptr [[RESPTR_2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_3:%.*]] = getelementptr [[TMP0]], ptr [[DATA]], i32 0, i32 0, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_3:%.*]] = load i32, ptr [[RESPTR_3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_0:%.*]] = insertelement <3 x i32> undef, i32 [[RES_1]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_1:%.*]] = insertelement <3 x i32> [[VAL_0]], i32 [[RES_2]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_2:%.*]] = insertelement <3 x i32> [[VAL_1]], i32 [[RES_3]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: ret <3 x i32> [[VAL_2]] -; -; -; LOWERRAYTRACINGPIPELINE-LABEL: define <3 x float> @_cont_ObjectRayOrigin3( -; LOWERRAYTRACINGPIPELINE-SAME: ptr nocapture readnone [[DATA:%.*]], ptr [[HITDATA:%.*]]) { -; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_1:%.*]] = getelementptr [[STRUCT_HITDATA:%.*]], ptr [[HITDATA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_1:%.*]] = load float, ptr [[RESPTR_1]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_2:%.*]] = getelementptr [[STRUCT_HITDATA]], ptr [[HITDATA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_2:%.*]] = load float, ptr [[RESPTR_2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_3:%.*]] = getelementptr [[STRUCT_HITDATA]], ptr [[HITDATA]], i32 0, i32 0, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_3:%.*]] = load float, ptr 
[[RESPTR_3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_0:%.*]] = insertelement <3 x float> undef, float [[RES_1]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_1:%.*]] = insertelement <3 x float> [[VAL_0]], float [[RES_2]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_2:%.*]] = insertelement <3 x float> [[VAL_1]], float [[RES_3]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: ret <3 x float> [[VAL_2]] -; -; -; LOWERRAYTRACINGPIPELINE-LABEL: define <3 x float> @_cont_ObjectRayDirection3( -; LOWERRAYTRACINGPIPELINE-SAME: ptr nocapture readnone [[DATA:%.*]], ptr [[HITDATA:%.*]]) { -; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_1:%.*]] = getelementptr [[STRUCT_HITDATA:%.*]], ptr [[HITDATA]], i32 0, i32 1, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_1:%.*]] = load float, ptr [[RESPTR_1]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_2:%.*]] = getelementptr [[STRUCT_HITDATA]], ptr [[HITDATA]], i32 0, i32 1, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_2:%.*]] = load float, ptr [[RESPTR_2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_3:%.*]] = getelementptr [[STRUCT_HITDATA]], ptr [[HITDATA]], i32 0, i32 1, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_3:%.*]] = load float, ptr [[RESPTR_3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_0:%.*]] = insertelement <3 x float> undef, float [[RES_1]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_1:%.*]] = insertelement <3 x float> [[VAL_0]], float [[RES_2]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_2:%.*]] = insertelement <3 x float> [[VAL_1]], float [[RES_3]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: ret <3 x float> [[VAL_2]] -; -; -; LOWERRAYTRACINGPIPELINE-LABEL: define float @_cont_RayTCurrent( -; LOWERRAYTRACINGPIPELINE-SAME: ptr nocapture readnone [[DATA:%.*]], ptr [[HITDATA:%.*]]) { -; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_HITDATA:%.*]], ptr [[HITDATA]], i32 0, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[RES:%.*]] = load float, ptr [[RESPTR]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: 
ret float [[RES]] -; -; -; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-SAME: i32 [[RETURNADDR:%.*]], [[TMP0:%.*]] [[TMP0]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META14:![0-9]+]] !continuation.entry [[META20:![0-9]+]] !continuation.registercount [[META14]] !continuation [[META21:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[TMP0]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[TMP0]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = bitcast ptr [[TMP4]] to ptr -; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA22:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = 
call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[TMP2]] undef, [[TMP0]] [[DIS_DATA_I]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[TMP1:%.*]] undef, [[TMP2]] [[SYS_DATA_I]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = call i32 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[TMP1]] [[TRAV_DATA_I]], i32 [[ADDR_I]], 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP25]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = call { [[TMP0]], [32 x i32], [10 x i32] } (...) @lgc.cps.await__sl_s_sa32i32a10i32s(i32 4, i32 8, i32 poison, [[TMP1]] [[TRAV_DATA2_I]], [16 x i32] poison, [10 x i32] [[TMP21]]), !continuation.registercount [[META18:![0-9]+]], !continuation.returnedRegistercount [[META18]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = extractvalue { [[TMP0]], [32 x i32], [10 x i32] } [[TMP28]], 2 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[TMP24]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = freeze [[STRUCT_RAYPAYLOAD]] poison -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] [[TMP38]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP36]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP31]], ptr [[TMP30]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = extractvalue { [[TMP0]], [32 x i32], [10 x i32] } [[TMP28]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[TMP0]] [[TMP22]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] -; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA22]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP50]], i8 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP51]], i8 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP40]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = extractelement <4 x float> [[TMP49]], i64 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = extractelement <4 x float> [[TMP49]], i64 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = extractelement <4 x float> [[TMP49]], i64 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = extractelement <4 x float> [[TMP49]], i64 3 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP53]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP54]], float [[TMP55]], 
float [[TMP56]], float [[TMP57]], i8 15) -; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() -; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; -; -; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyClosestHit( -; LOWERRAYTRACINGPIPELINE-SAME: i32 [[RETURNADDR:%.*]], [[TMP2:%.*]] [[TMP0:%.*]], [32 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META25:![0-9]+]] !continuation.registercount [[META18]] !continuation [[META26:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[TMP2]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[TMP2]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[TMP2]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = call i32 @_cont_GetLocalRootIndex(ptr [[TMP4]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr 
[[TMP6]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP17]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr [[HITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[HITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP21]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP5]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[HITATTRS]], i32 0, i32 0 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP25]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = fsub fast float 1.000000e+00, [[TMP26]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[TMP25]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = fsub fast float [[TMP27]], [[TMP28]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = insertelement <4 x float> undef, float [[TMP29]], i64 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP26]], i64 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[TMP28]], i64 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float 1.000000e+00, i64 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP33]], ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP35]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP38]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP35]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP36]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP40]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP45]], ptr [[TMP48]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[TMP2]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load [[TMP0]], ptr [[TMP46]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, [[TMP0]] [[TMP47]], [32 x i32] poison, [10 x i32] [[TMP49]]), !continuation.registercount [[META18]] -; LOWERRAYTRACINGPIPELINE-NEXT: unreachable -; diff --git a/llvmraytracing/test/dx/wrong-system-data.ll b/llvmraytracing/test/dx/wrong-system-data.ll deleted file mode 100644 index 79cae99e55..0000000000 --- a/llvmraytracing/test/dx/wrong-system-data.ll +++ /dev/null @@ -1,228 +0,0 @@ -; NOTE: Do not autogenerate -; RUN: not --crash opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s - -; CHECK: Invalid system data struct: Did not contain the needed struct type - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%dx.types.Handle = type { i8* } -%struct.TraversalData = type { %struct.SystemData } -%struct.SystemData = type { i32 } 
-%struct.DispatchSystemData = type { i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.RayPayload = type { <4 x float> } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -declare i64 @_AmdGetTraversalAddr() #0 - -declare !pointeetys !31 %struct.TraversalData @_AmdAnyHit(i64, %struct.TraversalData*) #0 - -declare i32 @_cont_GetContinuationStackAddr() #0 - -declare !pointeetys !33 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) #0 -declare !pointeetys !33 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) - -declare %struct.DispatchSystemData @_AmdTraversal(%struct.TraversalData) #0 - -declare !pointeetys !35 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 - -declare !pointeetys !37 void @_cont_SetTriangleHitAttributes(%struct.SystemData*, %struct.BuiltInTriangleIntersectionAttributes) #0 - -declare !pointeetys !38 i1 @_cont_IsEndSearch(%struct.TraversalData*) #0 - -declare !pointeetys !39 i32 @_cont_HitKind(%struct.SystemData*) #0 - -define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { - ret void -} - -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !40 { - %sys_data = insertvalue %struct.SystemData undef, i32 1, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %newdata = call %struct.DispatchSystemData @_AmdTraversal(%struct.TraversalData %trav_data) - store 
%struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 - ret void -} - -define i1 @_cont_ReportHit(%struct.TraversalData* %data, float %0, i32 %1) #0 !pointeetys !41 { - ret i1 true -} - -; Function Attrs: nounwind -define void @"\01?MyRaygenShader@@YAXXZ"() #1 { - %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.RayPayload, align 4 - %4 = bitcast %struct.RayPayload* %3 to i8* - %5 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 - store <4 x float> zeroinitializer, <4 x float>* %5, align 4, !tbaa !42 - %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) - %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }) - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) - ret void -} - -; Function Attrs: nounwind -define void @"\01?MyClosestHitShader@@YAXURayPayload@@UBuiltInTriangleIntersectionAttributes@@@Z"(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #1 !pointeetys !45 { - %1 = getelementptr inbounds %struct.BuiltInTriangleIntersectionAttributes, %struct.BuiltInTriangleIntersectionAttributes* %attr, i32 0, i32 0 - %2 = load <2 x float>, <2 x float>* %1, align 4 - %3 = extractelement <2 x float> %2, i32 0 - %4 = fsub fast float 1.000000e+00, %3 - %5 = extractelement <2 x float> %2, i32 1 - %6 = fsub fast float %4, %5 - %7 = insertelement <4 x float> undef, float %6, i64 0 - %8 = 
insertelement <4 x float> %7, float %3, i64 1 - %9 = insertelement <4 x float> %8, float %5, i64 2 - %10 = insertelement <4 x float> %9, float 1.000000e+00, i64 3 - %11 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - store <4 x float> %10, <4 x float>* %11, align 4 - ret void -} - -; Function Attrs: nounwind -define void @"\01?MyAnyHitShader@@YAXURayPayload@@UBuiltInTriangleIntersectionAttributes@@@Z"(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readnone %attr) #1 !pointeetys !45 { - %1 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - %2 = load <4 x float>, <4 x float>* %1, align 4 - %3 = call float @dx.op.objectRayOrigin.f32(i32 149, i8 0) - %4 = call float @dx.op.objectRayDirection.f32(i32 150, i8 0) - %5 = call float @dx.op.rayTCurrent.f32(i32 154) - %6 = fmul fast float %5, %4 - %7 = fadd fast float %6, %3 - %8 = fcmp fast ogt float %7, 0.000000e+00 - br i1 %8, label %9, label %10 - -9: ; preds = %0 - store <4 x float> %2, <4 x float>* %1, align 4 - call void @dx.op.acceptHitAndEndSearch(i32 156) - unreachable - -10: ; preds = %0 - store <4 x float> %2, <4 x float>* %1, align 4 - ret void -} - -; Function Attrs: nounwind -define void @"\01?MyIntersectionShader@@YAXXZ"() #1 { - %1 = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4 - %2 = call float @dx.op.rayTCurrent.f32(i32 154) - %3 = bitcast %struct.BuiltInTriangleIntersectionAttributes* %1 to i8* - call void @llvm.lifetime.start.p0i8(i64 8, i8* %3) #2 - %4 = call i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32 158, float %2, i32 0, %struct.BuiltInTriangleIntersectionAttributes* nonnull %1) - call void @llvm.lifetime.end.p0i8(i64 8, i8* %3) #2 - ret void -} - -; Function Attrs: nounwind -define void @"\01?MyMissShader@@YAXURayPayload@@@Z"(%struct.RayPayload* noalias nocapture %payload) #1 !pointeetys !48 { - %1 = getelementptr inbounds 
%struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - store <4 x float> , <4 x float>* %1, align 4 - ret void -} - -; Function Attrs: nounwind -declare !pointeetys !49 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #2 - -; Function Attrs: nounwind -declare void @dx.op.textureStore.f32(i32, %dx.types.Handle, i32, i32, i32, float, float, float, float, i8) #2 - -; Function Attrs: nounwind memory(none) -declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #3 - -; Function Attrs: nounwind memory(none) -declare float @dx.op.objectRayDirection.f32(i32, i8) #3 - -; Function Attrs: nounwind memory(none) -declare float @dx.op.objectRayOrigin.f32(i32, i8) #3 - -; Function Attrs: nounwind memory(read) -declare float @dx.op.rayTCurrent.f32(i32) #4 - -declare void @dx.op.acceptHitAndEndSearch(i32) #0 - -; Function Attrs: nounwind -declare !pointeetys !50 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) #2 - -; Function Attrs: nounwind memory(none) -declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #3 - -; Function Attrs: nounwind memory(read) -declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #4 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !51 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5 - -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare !pointeetys !51 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5 - -attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" 
"use-soft-float"="false" } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } -attributes #3 = { nounwind memory(none) } -attributes #4 = { nounwind memory(read) } -attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - -!llvm.ident = !{!0} -!dx.version = !{!1} -!dx.valver = !{!1} -!dx.shaderModel = !{!2} -!dx.resources = !{!3} -!dx.typeAnnotations = !{!10} -!dx.entryPoints = !{!18, !20, !23, !25, !27, !29} - -!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} -!1 = !{i32 1, i32 6} -!2 = !{!"lib", i32 6, i32 6} -!3 = !{!4, !7, null, null} -!4 = !{!5} -!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} -!6 = !{i32 0, i32 4} -!7 = !{!8} -!8 = !{i32 0, %"class.RWTexture2D >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} -!9 = !{i32 0, i32 9} -!10 = !{i32 1, void ()* @"\01?MyRaygenShader@@YAXXZ", !11, void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @"\01?MyClosestHitShader@@YAXURayPayload@@UBuiltInTriangleIntersectionAttributes@@@Z", !14, void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @"\01?MyAnyHitShader@@YAXURayPayload@@UBuiltInTriangleIntersectionAttributes@@@Z", !14, void ()* @"\01?MyIntersectionShader@@YAXXZ", !11, void (%struct.RayPayload*)* @"\01?MyMissShader@@YAXURayPayload@@@Z", !17} -!11 = !{!12} -!12 = !{i32 1, !13, !13} -!13 = !{} -!14 = !{!12, !15, !16} -!15 = !{i32 2, !13, !13} -!16 = 
!{i32 0, !13, !13} -!17 = !{!12, !15} -!18 = !{null, !"", null, !3, !19} -!19 = !{i32 0, i64 65536} -!20 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @"\01?MyAnyHitShader@@YAXURayPayload@@UBuiltInTriangleIntersectionAttributes@@@Z", !"\01?MyAnyHitShader@@YAXURayPayload@@UBuiltInTriangleIntersectionAttributes@@@Z", null, null, !21} -!21 = !{i32 8, i32 9, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!22 = !{i32 0} -!23 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @"\01?MyClosestHitShader@@YAXURayPayload@@UBuiltInTriangleIntersectionAttributes@@@Z", !"\01?MyClosestHitShader@@YAXURayPayload@@UBuiltInTriangleIntersectionAttributes@@@Z", null, null, !24} -!24 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, !22} -!25 = !{void ()* @"\01?MyIntersectionShader@@YAXXZ", !"\01?MyIntersectionShader@@YAXXZ", null, null, !26} -!26 = !{i32 8, i32 8, i32 5, !22} -!27 = !{void (%struct.RayPayload*)* @"\01?MyMissShader@@YAXURayPayload@@@Z", !"\01?MyMissShader@@YAXURayPayload@@@Z", null, null, !28} -!28 = !{i32 8, i32 11, i32 6, i32 16, i32 5, !22} -!29 = !{void ()* @"\01?MyRaygenShader@@YAXXZ", !"\01?MyRaygenShader@@YAXXZ", null, null, !30} -!30 = !{i32 8, i32 7, i32 5, !22} -!31 = !{%struct.TraversalData poison} -!32 = !{i32 0, %struct.TraversalData poison} -!33 = !{%struct.DispatchSystemData poison} -!34 = !{i32 0, %struct.DispatchSystemData poison} -!35 = !{%struct.SystemData poison} -!36 = !{i32 0, %struct.SystemData poison} -!37 = !{%struct.SystemData poison} -!38 = !{%struct.TraversalData poison} -!39 = !{%struct.SystemData poison} -!40 = !{%struct.DispatchSystemData poison} -!41 = !{%struct.TraversalData poison} -!42 = !{!43, !43, i64 0} -!43 = !{!"omnipotent char", !44, i64 0} -!44 = !{!"Simple C/C++ TBAA"} -!45 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison} -!46 = !{i32 0, %struct.RayPayload poison} -!47 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes 
poison} -!48 = !{%struct.RayPayload poison} -!49 = !{%struct.RayPayload poison} -!50 = !{%struct.BuiltInTriangleIntersectionAttributes poison} -!51 = !{i8 poison} diff --git a/llvmraytracing/test/intrinsics/discard-values.ll b/llvmraytracing/test/intrinsics/discard-values.ll index f238ebf6c1..e0e7c3810b 100644 --- a/llvmraytracing/test/intrinsics/discard-values.ll +++ b/llvmraytracing/test/intrinsics/discard-values.ll @@ -8,8 +8,6 @@ declare float @_AmdGetUninitializedF32() declare i32 @_AmdGetUninitializedI32() declare %struct.AnyHitData @_AmdGetUninitializedStruct() -declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) - define float @discard_f32() { ; CHECK-LABEL: define float @discard_f32() { ; CHECK-NEXT: entry: diff --git a/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll b/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll index 77814e93e6..2a935355e9 100644 --- a/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll +++ b/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll @@ -8,10 +8,10 @@ declare i32 @_AmdGetFuncAddr() -declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) -declare !pointeetys !8 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) declare !pointeetys !11 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) +declare !pointeetys !{%struct.DispatchSystemData poison} void @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) + define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !8 { ret void } diff --git a/llvmraytracing/test/intrinsics/get-func-addr.ll b/llvmraytracing/test/intrinsics/get-func-addr.ll index 9143a3b85b..dbf69737d1 100644 --- a/llvmraytracing/test/intrinsics/get-func-addr.ll +++ b/llvmraytracing/test/intrinsics/get-func-addr.ll @@ -7,22 +7,22 @@ declare i32 @_AmdGetFuncAddrMyFunc() %struct.TraversalData = type { } -declare !pointeetys !8 <3 x i32> 
@_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) declare !pointeetys !12 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) +declare !pointeetys !{%struct.DispatchSystemData poison} void @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) + define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !8 { ret void } define { i32, i32 } @main() !lgc.rt.shaderstage !10 { ; CHECK-LABEL: define void @main -; CHECK-SAME: (i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META10:![0-9]+]] !continuation.registercount [[META5]] !continuation [[META11:![0-9]+]] { +; CHECK-SAME: (i32 [[SHADERINDEX:%.*]], i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META10:![0-9]+]] !continuation.registercount [[META5]] !continuation [[META11:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 ; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; CHECK-NEXT: call void @lgc.ilcps.setLocalRootIndex(i32 0) ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) 
@lgc.cps.as.continuation.reference(ptr @MyFunc) ; CHECK-NEXT: [[V0:%.*]] = insertvalue { i32, i32 } undef, i32 [[TMP1]], 0 ; CHECK-NEXT: call void @lgc.cps.complete() diff --git a/llvmraytracing/test/intrinsics/shader-start.ll b/llvmraytracing/test/intrinsics/shader-start.ll index c4bdade5f8..d7ecfc941f 100644 --- a/llvmraytracing/test/intrinsics/shader-start.ll +++ b/llvmraytracing/test/intrinsics/shader-start.ll @@ -6,23 +6,23 @@ %struct.HitData = type { float, i32 } %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -declare !pointeetys !8 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) declare !pointeetys !13 i1 @_cont_ReportHit(%struct.DispatchSystemData* %data, float %t, i32 %hitKind) declare !pointeetys !15 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0 +declare !pointeetys !{%struct.DispatchSystemData poison} void @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) + define void @main() !lgc.rt.shaderstage !10 { ; CHECK-LABEL: define void @main( -; CHECK-SAME: i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !continuation [[META6:![0-9]+]] { +; CHECK-SAME: i32 [[SHADERINDEX:%.*]], i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !continuation [[META6:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 ; CHECK-NEXT: store [30 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-NEXT: store i32 123, ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], [8 x i32] poison, [30 x i32] [[TMP2]]), !continuation.registercount [[META0]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], [8 x i32] poison, [30 x i32] [[TMP2]]), !continuation.registercount [[META0]] ; CHECK-NEXT: unreachable ; entry: @@ -31,7 +31,7 @@ entry: define void @_cont_ShaderStart(%struct.DispatchSystemData* %data) !pointeetys !11 { ; CHECK-LABEL: define void @_cont_ShaderStart( -; CHECK-SAME: ptr [[DATA:%.*]]) !pointeetys [[META3:![0-9]+]] { +; CHECK-SAME: ptr [[DATA:%.*]]) !pointeetys [[META4:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 0 ; CHECK-NEXT: store i32 123, ptr [[TMP0]], align 4 @@ -61,7 +61,7 @@ entry: !15 = !{%struct.SystemData poison} ;. ; CHECK: [[META0]] = !{i32 30} -; CHECK: [[META3]] = !{%struct.DispatchSystemData poison} +; CHECK: [[META4]] = !{%struct.DispatchSystemData poison} ; CHECK: [[META5]] = !{i32 1} ; CHECK: [[META6]] = !{ptr @main} ;. 
diff --git a/llvmraytracing/test/lgccps/CpsLowering/continuation-basic.ll b/llvmraytracing/test/lgccps/CpsLowering/continuation-basic.ll index 71cfff438d..f4e0c9d3ce 100644 --- a/llvmraytracing/test/lgccps/CpsLowering/continuation-basic.ll +++ b/llvmraytracing/test/lgccps/CpsLowering/continuation-basic.ll @@ -13,14 +13,14 @@ define void @test(i32 %arg, ptr %table) !lgc.cps !0 !lgc.shaderstage !{i32 7} !c ; CHECK-NEXT: [[CR_THEN:%.*]] = load i32, ptr [[TABLE_0]], align 4 ; CHECK-NEXT: [[THEN_ARG:%.*]] = add i32 [[ARG]], 1 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_THEN]], i32 2, i32 [[TMP0]], i32 [[THEN_ARG]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_THEN]], i32 2, i32 [[TMP0]], i32 poison, i32 poison, i32 [[THEN_ARG]]) ; CHECK-NEXT: unreachable ; entry: %table.0 = getelementptr i32, ptr %table, i32 0 %cr.then = load i32, ptr %table.0 %then.arg = add i32 %arg, 1 - call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, i32 poison, i32 %then.arg) + call void (...) 
@lgc.cps.jump(i32 %cr.then, i32 2, i32 poison, i32 poison, i32 poison, i32 %then.arg) unreachable } !continuation.stackAddrspace = !{!1} diff --git a/llvmraytracing/test/lgccps/CpsLowering/cps-entry-point.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-entry-point.ll index 929f97d738..04ec21efc4 100644 --- a/llvmraytracing/test/lgccps/CpsLowering/cps-entry-point.ll +++ b/llvmraytracing/test/lgccps/CpsLowering/cps-entry-point.ll @@ -13,7 +13,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc ; CHECK-NEXT: [[_ENTRY:.*:]] ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[DESC:%.*]] = call <4 x i32> @lgc.load.user.data__v4i32(i32 0) -; CHECK-NEXT: [[PTR:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[DESC]]) +; CHECK-NEXT: [[PTR:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[DESC]], i1 false) ; CHECK-NEXT: [[P0:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 0 ; CHECK-NEXT: [[I_VSP:%.*]] = load i32, ptr addrspace(7) [[P0]], align 4 ; CHECK-NEXT: store i32 [[I_VSP]], ptr [[CSP]], align 4 @@ -28,7 +28,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc ; .entry: %desc = call <4 x i32> @lgc.load.user.data__v4i32(i32 0) - %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) + %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc, i1 false) %p0 = getelementptr i32, ptr addrspace(7) %ptr, i32 0 %i_vsp = load i32, ptr addrspace(7) %p0, align 4 %vsp = inttoptr i32 %i_vsp to ptr addrspace(32) @@ -48,7 +48,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc declare <4 x i32> @lgc.load.user.data__v4i32(i32) #4 -declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #5 +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>, i1) #5 attributes #0 = { nounwind } attributes #1 = { nounwind willreturn memory(inaccessiblemem: write) } diff --git 
a/llvmraytracing/test/lgccps/CpsLowering/cps-from-continufy.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-from-continufy.ll index 51e44a2df8..7ef8bd6050 100644 --- a/llvmraytracing/test/lgccps/CpsLowering/cps-from-continufy.ll +++ b/llvmraytracing/test/lgccps/CpsLowering/cps-from-continufy.ll @@ -31,14 +31,14 @@ define spir_func void @_rgen_1(i32 %rcr) #0 !spirv.ExecutionModel !15 !lgc.shade ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP17]], i64 1 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 -1, i64 2 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 553734060, i64 3 -; CHECK-NEXT: [[TMP21:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP20]]) +; CHECK-NEXT: [[TMP21:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP20]], i1 false) ; CHECK-NEXT: [[TMP22:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) ; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP22]], i64 0 ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <2 x i32> [[TMP23]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr addrspace(4) ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP25]], i32 32 ; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP26]], align 16 -; CHECK-NEXT: [[TMP28:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP27]]) +; CHECK-NEXT: [[TMP28:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP27]], i1 false) ; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP29]], i32 0 ; CHECK-NEXT: store ptr addrspace(7) [[TMP28]], ptr addrspace(5) [[TMP30]], align 32 @@ -48,7 +48,7 @@ define spir_func void @_rgen_1(i32 %rcr) #0 !spirv.ExecutionModel !15 !lgc.shade ; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr addrspace(4) ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(4) 
[[TMP34]], i32 48 ; CHECK-NEXT: [[TMP36:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP35]], align 16 -; CHECK-NEXT: [[TMP37:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP36]]) +; CHECK-NEXT: [[TMP37:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP36]], i1 false) ; CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP0]], 8 ; CHECK-NEXT: [[TMP39:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP39]], i32 0 @@ -74,7 +74,7 @@ define spir_func void @_rgen_1(i32 %rcr) #0 !spirv.ExecutionModel !15 !lgc.shade ; CHECK-NEXT: [[TMP58:%.*]] = inttoptr i32 [[TMP57]] to ptr ; CHECK-NEXT: [[TMP59:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @_rgen_1.resume.0) ; CHECK-NEXT: [[TMP60:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[TMP57]], i32 2, i32 [[TMP60]], i32 [[TMP59]], [1 x i32] undef, i32 [[TMP45]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[TMP57]], i32 2, i32 [[TMP60]], i32 poison, i32 [[TMP59]], [1 x i32] undef, i32 [[TMP45]]) ; CHECK-NEXT: unreachable ; .entry: @@ -98,14 +98,14 @@ define spir_func void @_rgen_1(i32 %rcr) #0 !spirv.ExecutionModel !15 !lgc.shade %17 = insertelement <4 x i32> %15, i32 %16, i64 1 %18 = insertelement <4 x i32> %17, i32 -1, i64 2 %19 = insertelement <4 x i32> %18, i32 553734060, i64 3 - %20 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %19) + %20 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %19, i1 false) %21 = call i32 @lgc.load.user.data__i32(i32 0) %22 = insertelement <2 x i32> %4, i32 %21, i64 0 %23 = bitcast <2 x i32> %22 to i64 %24 = inttoptr i64 %23 to ptr addrspace(4) %25 = getelementptr i8, ptr addrspace(4) %24, i32 32 %26 = load <4 x i32>, ptr addrspace(4) %25, align 16 - %27 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %26) + %27 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %26, i1 false) %28 = getelementptr 
inbounds %_rgen_1.Frame, ptr addrspace(32) %0, i32 0, i32 0 store ptr addrspace(7) %27, ptr addrspace(32) %28, align 32 %29 = call i32 @lgc.load.user.data__i32(i32 0) @@ -114,7 +114,7 @@ define spir_func void @_rgen_1(i32 %rcr) #0 !spirv.ExecutionModel !15 !lgc.shade %32 = inttoptr i64 %31 to ptr addrspace(4) %33 = getelementptr i8, ptr addrspace(4) %32, i32 48 %34 = load <4 x i32>, ptr addrspace(4) %33, align 16 - %35 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %34) + %35 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %34, i1 false) %36 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %0, i32 0, i32 1 store ptr addrspace(7) %35, ptr addrspace(32) %36, align 32 %37 = load volatile i32, ptr addrspace(7) %35, align 4 @@ -135,7 +135,7 @@ define spir_func void @_rgen_1(i32 %rcr) #0 !spirv.ExecutionModel !15 !lgc.shade %51 = or i32 %50, 1 %52 = inttoptr i32 %51 to ptr %53 = call i32 (...) @lgc.cps.as.continuation.reference(ptr @_rgen_1.resume.0) - call void (...) @lgc.cps.jump(i32 %51, i32 2, i32 poison, i32 %53, [1 x i32] undef, i32 %39) + call void (...) 
@lgc.cps.jump(i32 %51, i32 2, i32 poison, i32 poison, i32 %53, [1 x i32] undef, i32 %39) unreachable } @@ -193,7 +193,7 @@ declare i32 @lgc.load.user.data__i32(i32) #1 declare i64 @llvm.amdgcn.s.getpc() #2 -declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1 +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>, i1) #1 declare ptr addrspace(32) @lgc.cps.alloc(i32) #6 diff --git a/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-global.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-global.ll deleted file mode 100644 index a14a7fe618..0000000000 --- a/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-global.ll +++ /dev/null @@ -1,244 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -o - -passes='cleanup-continuations' %s -S | FileCheck --check-prefixes=CHECK %s - -%_rgen_1.Frame = type { ptr addrspace(22), ptr addrspace(22), i32 } - -declare void @lgc.cps.jump(...) 
#0 - -declare ptr addrspace(32) @lgc.cps.alloc(i32) - -declare void @lgc.cps.free(i32) - -declare i32 @lgc.cps.as.continuation.reference(ptr) - -declare ptr addrspace(32) @lgc.cps.peek(i32) - -declare ptr addrspace(32) @lgc.cps.get.vsp() - -declare i32 @lgc.cps.get.dummy.index(i32) - -declare void @lgc.cps.complete() - -declare i64 @_cont_GetContinuationStackGlobalMemBase() - -define { ptr, ptr } @test.0(ptr %0) !lgc.cps !1 !lgc.rt.shaderstage !2 !continuation !3 { -; CHECK-LABEL: define void @test.0( -; CHECK-SAME: ) !lgc.cps [[META1:![0-9]+]] !lgc.rt.shaderstage [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.state [[META4:![0-9]+]] { -; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() -; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr addrspace(22) -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP3]], i32 [[TMP0]] -; CHECK-NEXT: store i32 333, ptr addrspace(22) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP3]], i32 [[TMP4]] -; CHECK-NEXT: store i32 111, ptr addrspace(22) [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], 9 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP3]], i32 [[TMP7]] -; CHECK-NEXT: store i8 99, ptr addrspace(22) [[TMP8]], align 1 -; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP10]], i32 poison, i32 [[TMP7]], i32 [[TMP4]]) -; CHECK-NEXT: unreachable -; -AllocaSpillBB: - %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) - store i32 333, ptr addrspace(32) %mem, align 4 - %p1 = getelementptr i32, ptr addrspace(32) %mem, i32 1 - store i32 111, ptr addrspace(32) %p1, align 4 - %p2 = getelementptr i8, ptr addrspace(32) %mem, i32 9 - store i8 99, ptr addrspace(32) %p2, align 1 - %q1 = ptrtoint ptr addrspace(32) %p1 to i32 - %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %p2, i32 %q1) - unreachable -} - -define { ptr, ptr } @test.1(ptr addrspace(32) %p2, i32 %q1, ptr %0) !lgc.cps !1 !lgc.rt.shaderstage !2 !continuation !4 { -; CHECK-LABEL: define void @test.1( -; CHECK-SAME: i32 [[P2:%.*]], i32 [[Q1:%.*]]) !lgc.cps [[META1]] !lgc.rt.shaderstage [[META2]] !continuation [[META5:![0-9]+]] !continuation.state [[META4]] { -; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() -; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(22) -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[Q1]] -; CHECK-NEXT: [[N111:%.*]] = load i32, ptr addrspace(22) [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[P2]] -; CHECK-NEXT: [[N99:%.*]] = load i8, ptr addrspace(22) [[TMP3]], align 1 -; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP4]], i32 poison) -; CHECK-NEXT: unreachable -; -AllocaSpillBB: - %p1 = inttoptr i32 %q1 to ptr addrspace(32) - %n111 = load i32, ptr addrspace(32) %p1, align 4 - %n99 = load i8, ptr addrspace(32) %p2, align 1 - %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison) - unreachable -} - -define { ptr, ptr } @test.2(ptr %0) !lgc.cps !1 !lgc.rt.shaderstage !2 !continuation !5 { -; CHECK-LABEL: define void @test.2( -; CHECK-SAME: ) !lgc.cps [[META1]] !lgc.rt.shaderstage [[META2]] !continuation [[META6:![0-9]+]] !continuation.state [[META4]] { -; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() -; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr addrspace(22) -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -12 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP3]], i32 [[TMP1]] -; CHECK-NEXT: [[N333:%.*]] = load i32, ptr addrspace(22) [[TMP6]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -12 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[CSP]], align 4 -; CHECK-NEXT: ret void -; -AllocaSpillBB: - %mem = call ptr addrspace(32) @lgc.cps.peek(i32 10) - %n333 = load i32, ptr addrspace(32) %mem, align 4 - call void @lgc.cps.free(i32 10) - call void @lgc.cps.complete() - unreachable -} - -define { ptr, ptr } @test.gep(ptr %0) !lgc.cps !1 !lgc.rt.shaderstage !2 !continuation !6 { -; CHECK-LABEL: define void @test.gep( -; CHECK-SAME: ) !lgc.cps [[META1]] !lgc.rt.shaderstage [[META2]] !continuation [[META7:![0-9]+]] !continuation.state [[META4]] { -; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call i64 
@_cont_GetContinuationStackGlobalMemBase() -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(22) -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[STACK_EL0:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STACK_EL0]], 24 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP6]], i32 [[TMP3]] -; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(22) [[TMP11]], align 4 -; CHECK-NEXT: [[STACK_EL1:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 1) -; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[STACK_EL1]], 24 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], -4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP6]], i32 [[TMP8]] -; CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(22) [[TMP12]], align 4 -; CHECK-NEXT: [[STACK_EL2:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 2) -; CHECK-NEXT: [[STACK_EL2_DIV:%.*]] = sdiv i32 [[STACK_EL2]], 2 -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP0]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[STACK_EL2_DIV]], 24 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], -8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP6]], i32 [[TMP15]] -; CHECK-NEXT: store i32 [[TMP17]], ptr addrspace(22) [[TMP18]], align 4 -; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP20]], i32 poison, i32 [[TMP17]], i32 [[TMP17]]) -; CHECK-NEXT: unreachable -; -AllocaSpillBB: - %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) - %stack.el0 = call i32 @lgc.cps.get.dummy.index(i32 0) - %1 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el0 - %vsp = call ptr addrspace(32) @lgc.cps.get.vsp() - %vsp.i = ptrtoint ptr addrspace(32) %vsp to i32 - store i32 %vsp.i, ptr addrspace(32) %1, align 4 - %stack.el1 = call i32 @lgc.cps.get.dummy.index(i32 1) - %2 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el1 - %vsp.2 = call ptr addrspace(32) @lgc.cps.peek(i32 4) - %vsp.2.i = ptrtoint ptr addrspace(32) %vsp.2 to i32 - store i32 %vsp.2.i, ptr addrspace(32) %2, align 4 - %stack.el2 = call i32 @lgc.cps.get.dummy.index(i32 2) - %stack.el2.div = sdiv i32 %stack.el2, 2 - %3 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el2.div, i32 1 - %vsp.3 = call ptr addrspace(32) @lgc.cps.peek(i32 8) - %vsp.3.i = ptrtoint ptr addrspace(32) %vsp.3 to i32 - store i32 %vsp.3.i, ptr addrspace(32) %3, align 4 - %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) - call void (...) 
@lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp.3, i32 %vsp.3.i) - unreachable -} - -define { ptr, ptr } @test.nested.gep(ptr %0) !lgc.cps !1 !lgc.rt.shaderstage !2 !continuation !7 { -; CHECK-LABEL: define void @test.nested.gep( -; CHECK-SAME: ) !lgc.cps [[META1]] !lgc.rt.shaderstage [[META2]] !continuation [[META8:![0-9]+]] !continuation.state [[META4]] { -; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(22) -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[STACK_EL0:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STACK_EL0]], 24 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP7]], i32 [[TMP4]] -; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(22) [[TMP9]], align 4 -; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP8]], i32 poison, i32 [[TMP5]], i32 [[TMP5]]) -; CHECK-NEXT: unreachable -; -AllocaSpillBB: - %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) - %stack.el0 = call i32 @lgc.cps.get.dummy.index(i32 0) - %gep.base = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el0 - %1 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %gep.base, i32 0, i32 2 - %vsp = call ptr addrspace(32) @lgc.cps.get.vsp() - %vsp.i = ptrtoint ptr addrspace(32) %vsp to i32 - store i32 %vsp.i, ptr addrspace(32) %1, align 4 - %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp, i32 %vsp.i) - unreachable -} - -declare !continuation !3 { ptr, ptr } @continuation.prototype.test.0(ptr, i1) - -declare ptr @continuation.malloc(i32) - -declare void @continuation.free(ptr) - -declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) #1 - -declare ptr @llvm.coro.begin(token, ptr writeonly) #1 - -declare !continuation !4 { ptr, ptr } @continuation.prototype.test.1(ptr, i1) - -declare !continuation !5 { ptr, ptr } @continuation.prototype.test.2(ptr, i1) - -declare !continuation !6 { ptr, ptr } @continuation.prototype.test.gep(ptr, i1) - -declare !continuation !7 { ptr, ptr } @continuation.prototype.test.nested.gep(ptr, i1) - -attributes #0 = { noreturn } -attributes #1 = { nounwind } - -!continuation.stackAddrspace = !{!0} - -!0 = !{i32 22} -!1 = !{i32 1} -!2 = !{i32 7} -!3 = !{ptr @test.0} -!4 = !{ptr @test.1} -!5 = !{ptr @test.2} -!6 = !{ptr @test.gep} -!7 = !{ptr @test.nested.gep} -;. -; CHECK: [[META1]] = !{i32 1} -; CHECK: [[META2]] = !{i32 7} -; CHECK: [[META3]] = !{ptr @test.0} -; CHECK: [[META4]] = !{i32 0} -; CHECK: [[META5]] = !{ptr @test.1} -; CHECK: [[META6]] = !{ptr @test.2} -; CHECK: [[META7]] = !{ptr @test.gep} -; CHECK: [[META8]] = !{ptr @test.nested.gep} -;. 
diff --git a/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-scratch.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-scratch.ll deleted file mode 100644 index f8fe1ecdd8..0000000000 --- a/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering-dxil-scratch.ll +++ /dev/null @@ -1,247 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -o - -passes='cleanup-continuations' %s -S | FileCheck --check-prefixes=CHECK %s - -%_rgen_1.Frame = type { ptr addrspace(21), ptr addrspace(21), i32 } - -declare void @lgc.cps.jump(...) #0 - -declare ptr addrspace(32) @lgc.cps.alloc(i32) - -declare void @lgc.cps.free(i32) - -declare i32 @lgc.cps.as.continuation.reference(ptr) - -declare ptr addrspace(32) @lgc.cps.peek(i32) - -declare ptr addrspace(32) @lgc.cps.get.vsp() - -declare i32 @lgc.cps.get.dummy.index(i32) - -declare void @lgc.cps.complete() - -define { ptr, ptr } @test.0(ptr %0) !lgc.cps !1 !lgc.shaderstage !2 !continuation !3 { -; CHECK-LABEL: define void @test.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1:![0-9]+]] !lgc.shaderstage [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.state [[META4:![0-9]+]] { -; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; CHECK-NEXT: store i32 333, ptr addrspace(21) [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0 -; CHECK-NEXT: 
store i32 111, ptr addrspace(21) [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], 9 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP8]], i32 0 -; CHECK-NEXT: store i8 99, ptr addrspace(21) [[TMP9]], align 1 -; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP10]], i32 poison, i32 [[TMP7]], i32 [[TMP4]]) -; CHECK-NEXT: unreachable -; -AllocaSpillBB: - %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) - store i32 333, ptr addrspace(32) %mem, align 4 - %p1 = getelementptr i32, ptr addrspace(32) %mem, i32 1 - store i32 111, ptr addrspace(32) %p1, align 4 - %p2 = getelementptr i8, ptr addrspace(32) %mem, i32 9 - store i8 99, ptr addrspace(32) %p2, align 1 - %q1 = ptrtoint ptr addrspace(32) %p1 to i32 - %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) - call void (...) 
@lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %p2, i32 %q1) - unreachable -} - -define { ptr, ptr } @test.1(ptr addrspace(32) %p2, i32 %q1, ptr %0) !lgc.cps !1 !lgc.shaderstage !2 !continuation !4 { -; CHECK-LABEL: define void @test.1( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[P2:%.*]], i32 [[Q1:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META5:![0-9]+]] !continuation.state [[META4]] { -; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[Q1]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP0]], i32 0 -; CHECK-NEXT: [[N111:%.*]] = load i32, ptr addrspace(21) [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[P2]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; CHECK-NEXT: [[N99:%.*]] = load i8, ptr addrspace(21) [[TMP3]], align 1 -; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP4]], i32 poison) -; CHECK-NEXT: unreachable -; -AllocaSpillBB: - %p1 = inttoptr i32 %q1 to ptr addrspace(32) - %n111 = load i32, ptr addrspace(32) %p1, align 4 - %n99 = load i8, ptr addrspace(32) %p2, align 1 - %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) - call void (...) 
@lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison) - unreachable -} - -define { ptr, ptr } @test.2(ptr %0) !lgc.cps !1 !lgc.shaderstage !2 !continuation !5 { -; CHECK-LABEL: define void @test.2( -; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META6:![0-9]+]] !continuation.state [[META4]] { -; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -12 -; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; CHECK-NEXT: [[N333:%.*]] = load i32, ptr addrspace(21) [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -12 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[CSP]], align 4 -; CHECK-NEXT: ret void -; -AllocaSpillBB: - %mem = call ptr addrspace(32) @lgc.cps.peek(i32 10) - %n333 = load i32, ptr addrspace(32) %mem, align 4 - call void @lgc.cps.free(i32 10) - call void @lgc.cps.complete() - unreachable -} - -define { ptr, ptr } @test.gep(ptr %0) !lgc.cps !1 !lgc.shaderstage !2 !continuation !6 { -; CHECK-LABEL: define void @test.gep( -; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META7:![0-9]+]] !continuation.state [[META4]] { -; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[STACK_EL0:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STACK_EL0]], 24 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 
[[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(21) [[TMP6]], align 4 -; CHECK-NEXT: [[STACK_EL1:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 1) -; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[STACK_EL1]], 24 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], -4 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP8]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP11]], i32 0 -; CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(21) [[TMP12]], align 4 -; CHECK-NEXT: [[STACK_EL2:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 2) -; CHECK-NEXT: [[STACK_EL2_DIV:%.*]] = sdiv i32 [[STACK_EL2]], 2 -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP0]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[STACK_EL2_DIV]], 24 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], -8 -; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP15]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP18]], i32 0 -; CHECK-NEXT: store i32 [[TMP17]], ptr addrspace(21) [[TMP19]], align 4 -; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP20]], i32 poison, i32 [[TMP17]], i32 [[TMP17]]) -; CHECK-NEXT: unreachable -; -AllocaSpillBB: - %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) - %stack.el0 = call i32 @lgc.cps.get.dummy.index(i32 0) - %1 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el0 - %vsp = call ptr addrspace(32) @lgc.cps.get.vsp() - %vsp.i = ptrtoint ptr addrspace(32) %vsp to i32 - store i32 %vsp.i, ptr addrspace(32) %1, align 4 - %stack.el1 = call i32 @lgc.cps.get.dummy.index(i32 1) - %2 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el1 - %vsp.2 = call ptr addrspace(32) @lgc.cps.peek(i32 4) - %vsp.2.i = ptrtoint ptr addrspace(32) %vsp.2 to i32 - store i32 %vsp.2.i, ptr addrspace(32) %2, align 4 - %stack.el2 = call i32 @lgc.cps.get.dummy.index(i32 2) - %stack.el2.div = sdiv i32 %stack.el2, 2 - %3 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el2.div, i32 1 - %vsp.3 = call ptr addrspace(32) @lgc.cps.peek(i32 8) - %vsp.3.i = ptrtoint ptr addrspace(32) %vsp.3 to i32 - store i32 %vsp.3.i, ptr addrspace(32) %3, align 4 - %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) - call void (...) 
@lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp.3, i32 %vsp.3.i) - unreachable -} - -define { ptr, ptr } @test.nested.gep(ptr %0) !lgc.cps !1 !lgc.shaderstage !2 !continuation !7 { -; CHECK-LABEL: define void @test.nested.gep( -; CHECK-SAME: i32 [[CSPINIT:%.*]]) !lgc.cps [[META1]] !lgc.shaderstage [[META2]] !continuation [[META8:![0-9]+]] !continuation.state [[META4]] { -; CHECK-NEXT: [[ALLOCASPILLBB:.*:]] -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 12 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[STACK_EL0:%.*]] = call i32 @lgc.cps.get.dummy.index(i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STACK_EL0]], 24 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(21) -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP6]], i32 0 -; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(21) [[TMP7]], align 4 -; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP8]], i32 poison, i32 [[TMP5]], i32 [[TMP5]]) -; CHECK-NEXT: unreachable -; -AllocaSpillBB: - %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) - %stack.el0 = call i32 @lgc.cps.get.dummy.index(i32 0) - %gep.base = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %mem, i32 %stack.el0 - %1 = getelementptr inbounds %_rgen_1.Frame, ptr addrspace(32) %gep.base, i32 0, i32 2 - %vsp = call ptr addrspace(32) @lgc.cps.get.vsp() - %vsp.i = ptrtoint ptr addrspace(32) %vsp to i32 - store i32 %vsp.i, ptr addrspace(32) %1, align 4 - %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp, i32 %vsp.i) - unreachable -} - -declare !continuation !3 { ptr, ptr } @continuation.prototype.test.0(ptr, i1) - -declare ptr @continuation.malloc(i32) - -declare void @continuation.free(ptr) - -declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) #1 - -declare ptr @llvm.coro.begin(token, ptr writeonly) #1 - -declare !continuation !4 { ptr, ptr } @continuation.prototype.test.1(ptr, i1) - -declare !continuation !5 { ptr, ptr } @continuation.prototype.test.2(ptr, i1) - -declare !continuation !6 { ptr, ptr } @continuation.prototype.test.gep(ptr, i1) - -declare !continuation !7 { ptr, ptr } @continuation.prototype.test.nested.gep(ptr, i1) - -attributes #0 = { noreturn } -attributes #1 = { nounwind } - -!continuation.stackAddrspace = !{!0} - -!0 = !{i32 21} -!1 = !{i32 1} -!2 = !{i32 7} -!3 = !{ptr @test.0} -!4 = !{ptr @test.1} -!5 = !{ptr @test.2} -!6 = !{ptr @test.gep} -!7 = !{ptr @test.nested.gep} -;. -; CHECK: [[META1]] = !{i32 1} -; CHECK: [[META2]] = !{i32 7} -; CHECK: [[META3]] = !{ptr @test.0} -; CHECK: [[META4]] = !{i32 0} -; CHECK: [[META5]] = !{ptr @test.1} -; CHECK: [[META6]] = !{ptr @test.2} -; CHECK: [[META7]] = !{ptr @test.gep} -; CHECK: [[META8]] = !{ptr @test.nested.gep} -;. 
diff --git a/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering.ll b/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering.ll index ae352d4b23..b46e141633 100644 --- a/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering.ll +++ b/llvmraytracing/test/lgccps/CpsLowering/cps-stack-lowering.ll @@ -34,7 +34,7 @@ define void @test.0() !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !continuation ; CHECK-NEXT: store i8 99, ptr addrspace(5) [[TMP9]], align 1 ; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP10]], i32 poison, i32 [[TMP7]], i32 [[TMP4]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP10]], i32 poison, i32 6, i32 [[TMP7]], i32 [[TMP4]]) ; CHECK-NEXT: unreachable ; %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) ; round up to 12 during lowering @@ -50,7 +50,7 @@ define void @test.0() !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !continuation %q1 = ptrtoint ptr addrspace(32) %p1 to i32 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %p2, i32 %q1) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, i32 6, ptr addrspace(32) %p2, i32 %q1) unreachable } @@ -68,7 +68,7 @@ define void @test.1(ptr addrspace(32) %p2, i32 %q1) !lgc.cps !{i32 1} !lgc.shade ; CHECK-NEXT: [[N99:%.*]] = load i8, ptr addrspace(5) [[TMP3]], align 1 ; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP4]], i32 poison) +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP4]], i32 poison, i32 4) ; CHECK-NEXT: unreachable ; %p1 = inttoptr i32 %q1 to ptr addrspace(32) @@ -76,7 +76,7 @@ define void @test.1(ptr addrspace(32) %p2, i32 %q1) !lgc.cps !{i32 1} !lgc.shade %n99 = load i8, ptr addrspace(32) %p2 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.2) - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, i32 4) unreachable } @@ -143,7 +143,7 @@ define void @test.gep() !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !continuatio ; CHECK-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[TMP19]], align 4 ; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP20]], i32 poison, i32 [[TMP17]], i32 [[TMP17]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP20]], i32 poison, i32 5, i32 [[TMP17]], i32 [[TMP17]]) ; CHECK-NEXT: unreachable ; %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) ; round up to 12 during lowering @@ -168,7 +168,7 @@ define void @test.gep() !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !continuatio store i32 %vsp.3.i, ptr addrspace(32) %3 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp.3, i32 %vsp.3.i) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, i32 5, ptr addrspace(32) %vsp.3, i32 %vsp.3.i) unreachable } @@ -192,7 +192,7 @@ define void @test.nested.gep() !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !cont ; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[TMP7]], align 4 ; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP8]], i32 poison, i32 [[TMP5]], i32 [[TMP5]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP8]], i32 poison, i32 3, i32 [[TMP5]], i32 [[TMP5]]) ; CHECK-NEXT: unreachable ; %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) ; round up to 12 during lowering @@ -205,7 +205,7 @@ define void @test.nested.gep() !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} !cont store i32 %vsp.i, ptr addrspace(32) %1 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @test.1) - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, ptr addrspace(32) %vsp, i32 %vsp.i) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison, i32 poison, i32 3, ptr addrspace(32) %vsp, i32 %vsp.i) unreachable } diff --git a/llvmraytracing/test/lgccps/alloca-select.ll b/llvmraytracing/test/lgccps/alloca-select.ll index 5661fbc774..9bc5e8a0a7 100644 --- a/llvmraytracing/test/lgccps/alloca-select.ll +++ b/llvmraytracing/test/lgccps/alloca-select.ll @@ -5,7 +5,7 @@ declare !lgc.cps !0 void @callee({}, i32, float) -define void @test({} %state, i32 %rcr, float %arg, i32 %arg1) !lgc.cps !0 { +define void @test(i32 %shaderIndex, i32 %rcr, float %arg, i32 %arg1) !lgc.cps !0 { %a1 = alloca i32 %a2 = alloca i32 %cond = icmp ult i32 %arg1, 0 @@ -13,12 +13,12 @@ define void @test({} %state, i32 %rcr, float %arg, i32 %arg1) !lgc.cps !0 { store i32 111, ptr %p, align 4 %t0 = fadd float %arg, 1.0 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee) - %t1 = call { float } (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0), !continuation.returnedRegistercount !{i32 0} - %res = extractvalue { float } %t1, 0 + %t1 = call { i32, float } (...) @lgc.cps.await__sl_i32f32(i32 %cr, i32 2, i32 poison, float %t0), !continuation.returnedRegistercount !{i32 0} + %res = extractvalue { i32, float } %t1, 1 %tmp = fmul float %res, %arg %v111 = load float, ptr %p, align 4 %returnvalue = fmul float %tmp, %v111 - call void (...) 
@lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, float %returnvalue) + call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, i32 poison, float %returnvalue) unreachable } @@ -28,10 +28,10 @@ define void @test({} %state, i32 %rcr, float %arg, i32 %arg1) !lgc.cps !0 { !1 = !{i32 5} declare i32 @lgc.cps.as.continuation.reference(...) memory(none) -declare { float } @lgc.cps.await__f32(...) +declare { i32, float } @lgc.cps.await__sl_i32f32(...) declare void @lgc.cps.jump(...) ; CHECK-LABEL: define void @test -; CHECK-SAME: (i32 [[CSPINIT:%.*]], {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], i32 [[ARG1:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CHECK-SAME: (i32 [[CSPINIT:%.*]], i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], i32 [[ARG1:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -39,18 +39,18 @@ declare void @lgc.cps.jump(...) 
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 20 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 16 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(5) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP4]], i32 0 -; CHECK-NEXT: store i32 [[ARG1]], ptr addrspace(5) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 12 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(5) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP7]], i32 0 -; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP8]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 8 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP9]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP0]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP6]], i32 0 +; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 +; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP10]], i32 0 -; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP11]], align 4 +; CHECK-NEXT: store i32 [[ARG1]], ptr addrspace(5) [[TMP11]], align 4 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[ARG1]], 0 ; CHECK-NEXT: [[P_0:%.*]] = select i1 [[COND]], i32 [[TMP0]], i32 [[TMP2]] ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i32 [[P_0]] to ptr addrspace(5) @@ -61,35 +61,36 @@ declare void @lgc.cps.jump(...) 
; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i32 [[CR]] to ptr ; CHECK-NEXT: [[TMP15:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @test.resume.0) ; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP16]], i32 [[TMP15]], float [[T0]]), !continuation.returnedRegistercount [[META4:![0-9]+]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP16]], i32 poison, i32 [[TMP15]], float [[T0]]), !continuation.returnedRegistercount [[META4:![0-9]+]] ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @test.resume.0 -; CHECK-SAME: (i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], float [[TMP2:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { +; CHECK-SAME: (i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], float [[TMP1:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -20 -; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { float } poison, float [[TMP2]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP4]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 16 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(5) -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 -; CHECK-NEXT: [[ARG1_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 12 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP10]] to ptr addrspace(5) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP11]], i32 0 -; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP12]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], 8 -; CHECK-NEXT: 
[[TMP14:%.*]] = inttoptr i32 [[TMP13]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -20 +; CHECK-NEXT: [[TMP4:%.*]] = insertvalue { i32, float } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { i32, float } [[TMP4]], float [[TMP1]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP3]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP3]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP3]], 12 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP3]], 16 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP10]], i32 0 +; CHECK-NEXT: [[RELOAD_ROW0_RCR_:%.*]] = load i32, ptr addrspace(5) [[TMP11]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP12]], i32 0 +; CHECK-NEXT: [[RELOAD_ROW1_ARG_:%.*]] = load float, ptr addrspace(5) [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP9]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP14]], i32 0 -; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP15]], align 4 -; CHECK-NEXT: [[COND2:%.*]] = icmp ult i32 [[ARG1_RELOAD]], 0 -; CHECK-NEXT: [[P1_0:%.*]] = select i1 [[COND2]], i32 [[TMP4]], i32 [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = extractvalue { float } [[TMP5]], 0 -; CHECK-NEXT: [[TMP:%.*]] = fmul float [[RES]], [[ARG_RELOAD]] +; CHECK-NEXT: [[RELOAD_ROW2_ARG1_:%.*]] = load i32, ptr addrspace(5) [[TMP15]], align 4 +; CHECK-NEXT: [[COND2:%.*]] = icmp ult i32 [[RELOAD_ROW2_ARG1_]], 0 +; CHECK-NEXT: [[P1_0:%.*]] = select i1 [[COND2]], i32 [[TMP3]], i32 [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = extractvalue { i32, float } [[TMP5]], 1 +; CHECK-NEXT: [[TMP:%.*]] = fmul float [[RES]], [[RELOAD_ROW1_ARG_]] ; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i32 [[P1_0]] to ptr addrspace(5) ; CHECK-NEXT: 
[[TMP17:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP16]], i32 0 ; CHECK-NEXT: [[V111:%.*]] = load float, ptr addrspace(5) [[TMP17]], align 4 @@ -98,6 +99,6 @@ declare void @lgc.cps.jump(...) ; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], -20 ; CHECK-NEXT: store i32 [[TMP19]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP20]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RELOAD_ROW0_RCR_]], i32 2, i32 [[TMP20]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/await-if-else.ll b/llvmraytracing/test/lgccps/await-if-else.ll index 1a53da20d8..0a2c4de6cc 100644 --- a/llvmraytracing/test/lgccps/await-if-else.ll +++ b/llvmraytracing/test/lgccps/await-if-else.ll @@ -6,7 +6,7 @@ declare !lgc.cps !0 void @callee({}, i32, float) declare !lgc.cps !0 void @callee2({}, i32, float) -define void @test({} %state, i32 %rcr, float %arg) !lgc.cps !0 { +define void @test(i32 %shaderIndex, i32 %rcr, float %arg) !lgc.cps !0 { %t0 = fadd float %arg, 1.0 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee) %cr2 = call i32 @lgc.cps.as.continuation.reference(ptr @callee2) @@ -14,19 +14,19 @@ define void @test({} %state, i32 %rcr, float %arg) !lgc.cps !0 { br i1 %cond, label %bb1, label %bb2 bb1: - %t1 = call { float } (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %arg), !continuation.returnedRegistercount !{i32 0} - %res = extractvalue { float } %t1, 0 + %t1 = call { i32, float } (...) @lgc.cps.await__sl_i32f32(i32 %cr, i32 2, i32 poison, float %arg), !continuation.returnedRegistercount !{i32 0} + %res.1 = extractvalue { i32, float } %t1, 1 br label %bb3 bb2: - %t2 = call { float } (...) @lgc.cps.await__f32(i32 %cr2, i32 2, float %t0), !continuation.returnedRegistercount !{i32 0} - %res.2 = extractvalue { float } %t2, 0 + %t2 = call { i32, float } (...) 
@lgc.cps.await__sl_i32f32(i32 %cr2, i32 2, i32 poison, float %t0), !continuation.returnedRegistercount !{i32 0} + %res.2 = extractvalue { i32, float } %t2, 1 br label %bb3 bb3: - %t3 = phi float [%res, %bb1], [%res.2, %bb2] + %t3 = phi float [%res.1, %bb1], [%res.2, %bb2] %returnvalue = fmul float %t3, %arg - call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, float %returnvalue) + call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, i32 poison, float %returnvalue) unreachable } @@ -36,10 +36,10 @@ bb3: !1 = !{i32 5} declare i32 @lgc.cps.as.continuation.reference(...) memory(none) -declare { float } @lgc.cps.await__f32(...) +declare { i32, float } @lgc.cps.await__sl_i32f32(...) declare void @lgc.cps.jump(...) ; CHECK-LABEL: define void @test( -; CHECK-SAME: i32 [[CSPINIT:%.*]], {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -47,12 +47,12 @@ declare void @lgc.cps.jump(...) 
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 8 ; CHECK-NEXT: store i32 [[TMP14]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP13]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP13]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 0 -; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP13]] to ptr addrspace(5) +; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP7]], i32 0 -; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP8]], align 4 +; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP8]], align 4 ; CHECK-NEXT: [[T0:%.*]] = fadd float [[ARG]], 1.000000e+00 ; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee) ; CHECK-NEXT: [[CR2:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee2) @@ -62,62 +62,64 @@ declare void @lgc.cps.jump(...) ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @test.resume.0) ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP9]], i32 [[TMP1]], float [[ARG]]), !continuation.returnedRegistercount [[META4:![0-9]+]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP9]], i32 poison, i32 [[TMP1]], float [[ARG]]), !continuation.returnedRegistercount [[META4:![0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: bb2: ; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[CR2]] to ptr ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (...) 
@lgc.cps.as.continuation.reference(ptr @test.resume.1) ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR2]], i32 2, i32 [[TMP12]], i32 [[TMP3]], float [[T0]]), !continuation.returnedRegistercount [[META4]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR2]], i32 2, i32 [[TMP12]], i32 poison, i32 [[TMP3]], float [[T0]]), !continuation.returnedRegistercount [[META4]] ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @test.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], float [[TMP2:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], float [[TMP1:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -8 -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { float } poison, float [[TMP2]], 0 -; CHECK-NEXT: [[RES1:%.*]] = extractvalue { float } [[TMP13]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { i32, float } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { i32, float } [[TMP13]], float [[TMP1]], 1 +; CHECK-NEXT: [[RES_11:%.*]] = extractvalue { i32, float } [[TMP14]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP6]], i32 0 -; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +; 
CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 -; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 -; CHECK-NEXT: [[RETURNVALUE:%.*]] = fmul float [[RES1]], [[ARG_RELOAD]] +; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP9]], align 4 +; CHECK-NEXT: [[RETURNVALUE:%.*]] = fmul float [[RES_11]], [[ARG_RELOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @test.resume.1( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], float [[TMP2:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], float [[TMP1:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { ; CHECK-NEXT: entryresume.1: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -8 -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { float } poison, float [[TMP2]], 0 -; CHECK-NEXT: [[RES_21:%.*]] = extractvalue { float } [[TMP13]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { i32, float } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { i32, float } [[TMP13]], float [[TMP1]], 1 +; CHECK-NEXT: [[RES_21:%.*]] = extractvalue { i32, float } [[TMP14]], 1 ; 
CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP6]], i32 0 -; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 -; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 +; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP9]], align 4 ; CHECK-NEXT: [[RETURNVALUE:%.*]] = fmul float [[RES_21]], [[ARG_RELOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/await-if.ll b/llvmraytracing/test/lgccps/await-if.ll index 275e1ba823..0452c94abb 100644 --- a/llvmraytracing/test/lgccps/await-if.ll +++ b/llvmraytracing/test/lgccps/await-if.ll @@ -5,7 +5,7 @@ declare !lgc.cps !0 void @callee({}, i32, float) -define void @test({} %state, i32 %rcr, float %arg) !lgc.cps !0 { +define void @test(i32 %shaderIndex, i32 %rcr, float %arg) !lgc.cps !0 { entry: %t0 = fadd float %arg, 1.0 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee) @@ -13,14 +13,14 @@ entry: br i1 %cond, label %bb1, label %bb2 bb1: - %t1 = call { float } (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %arg), !continuation.returnedRegistercount !{i32 0} - %res = extractvalue { float } %t1, 0 + %t1 = call { i32, float } (...) @lgc.cps.await__sl_i32f32(i32 %cr, i32 2, i32 poison, float %arg), !continuation.returnedRegistercount !{i32 0} + %res = extractvalue { i32, float } %t1, 1 br label %bb2 bb2: %t3 = phi float [%res, %bb1], [%t0, %entry] %returnvalue = fmul float %t3, %arg - call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, float %returnvalue) + call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, i32 poison, float %returnvalue) unreachable } @@ -30,10 +30,10 @@ bb2: !1 = !{i32 5} declare i32 @lgc.cps.as.continuation.reference(...) memory(none) -declare { float } @lgc.cps.await__f32(...) +declare { i32, float } @lgc.cps.await__sl_i32f32(...) declare void @lgc.cps.jump(...) 
; CHECK-LABEL: define void @test( -; CHECK-SAME: i32 [[CSPINIT:%.*]], {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -41,12 +41,12 @@ declare void @lgc.cps.jump(...) ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 8 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 0 -; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(5) +; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP7]], i32 0 -; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP8]], align 4 +; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP8]], align 4 ; CHECK-NEXT: [[T0:%.*]] = fadd float [[ARG]], 1.000000e+00 ; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee) ; CHECK-NEXT: [[COND:%.*]] = fcmp olt float [[T0]], 1.000000e+00 @@ -55,7 +55,7 @@ declare void @lgc.cps.jump(...) ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) 
@lgc.cps.as.continuation.reference(ptr @test.resume.0) ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP9]], i32 [[TMP1]], float [[ARG]]), !continuation.returnedRegistercount [[META4:![0-9]+]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP9]], i32 poison, i32 [[TMP1]], float [[ARG]]), !continuation.returnedRegistercount [[META4:![0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: bb2: ; CHECK-NEXT: [[T0_BB2:%.*]] = phi float [ [[T0]], [[ENTRY:%.*]] ] @@ -64,31 +64,32 @@ declare void @lgc.cps.jump(...) ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, i32 [[TMP12]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, i32 [[TMP12]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @test.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], float [[TMP2:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], float [[TMP1:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -8 -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { float } poison, float [[TMP2]], 0 -; CHECK-NEXT: [[RES1:%.*]] = extractvalue { float } [[TMP13]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { i32, float } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { i32, float } [[TMP13]], float [[TMP1]], 1 +; 
CHECK-NEXT: [[RES1:%.*]] = extractvalue { i32, float } [[TMP14]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP6]], i32 0 -; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 -; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 +; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP9]], align 4 ; CHECK-NEXT: [[RETURNVALUE:%.*]] = fmul float [[RES1]], [[ARG_RELOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/await-in-loop.ll b/llvmraytracing/test/lgccps/await-in-loop.ll index 4f62171058..bdaf2d2f6c 100644 --- a/llvmraytracing/test/lgccps/await-in-loop.ll +++ b/llvmraytracing/test/lgccps/await-in-loop.ll @@ -5,7 +5,7 @@ declare !lgc.cps !0 void @callee({}, i32, i32) -define void @test({} %state, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 { +define void @test(i32 %shaderIndex, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 { entry: %t0 = fadd float %arg, 1.0 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee) @@ -13,16 +13,16 @@ entry: loop: %ind = phi i32 [0, %entry], [%inc, %loop] - %t1 = call { float } (...) @lgc.cps.await__f32(i32 %cr, i32 2, i32 %ind), !continuation.returnedRegistercount !{i32 0} + %t1 = call { i32, float } (...) @lgc.cps.await__sl_i32f32(i32 %cr, i32 2, i32 poison, i32 %ind), !continuation.returnedRegistercount !{i32 0} %inc = add i32 %ind, 1 - %res = extractvalue { float } %t1, 0 + %res = extractvalue { i32, float } %t1, 1 %cond = fcmp olt float %res, 5.0 br i1 %cond, label %loop, label %end end: %t2 = fmul float %res, %arg %returnvalue = fadd float %t2, %arg2 - call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, float %returnvalue) + call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, i32 poison, float %returnvalue) unreachable } @@ -32,30 +32,30 @@ end: !1 = !{i32 5} declare i32 @lgc.cps.as.continuation.reference(...) memory(none) -declare { float } @lgc.cps.await__f32(...) +declare { i32, float } @lgc.cps.await__sl_i32f32(...) declare void @lgc.cps.jump(...) 
; CHECK-LABEL: define void @test( -; CHECK-SAME: i32 [[CSPINIT:%.*]], {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 20 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP2]], 12 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 0 -; CHECK-NEXT: store float [[ARG2]], ptr addrspace(5) [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], 4 +; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 ; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP9]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP10]], i32 0 -; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP11]], align 4 +; CHECK-NEXT: store float [[ARG2]], ptr addrspace(5) [[TMP11]], align 4 ; CHECK-NEXT: 
[[T0:%.*]] = fadd float [[ARG]], 1.000000e+00 ; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee) -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP2]], 12 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP13]], i32 0 ; CHECK-NEXT: store i32 [[CR]], ptr addrspace(5) [[TMP14]], align 4 @@ -66,59 +66,60 @@ declare void @lgc.cps.jump(...) ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @test.resume.0) ; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP18]], i32 [[TMP1]], i32 0), !continuation.returnedRegistercount [[META4:![0-9]+]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP18]], i32 poison, i32 [[TMP1]], i32 0), !continuation.returnedRegistercount [[META4:![0-9]+]] ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @test.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], float [[TMP2:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], float [[TMP1:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP18]], -20 -; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { float } poison, float [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertvalue { i32, float } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = insertvalue { i32, float } [[TMP4]], float [[TMP1]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 16 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i32 
[[TMP8]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP9]], i32 0 ; CHECK-NEXT: [[IND_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP10]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add i32 [[IND_RELOAD]], 1 -; CHECK-NEXT: [[RES1:%.*]] = extractvalue { float } [[TMP30]], 0 +; CHECK-NEXT: [[RES1:%.*]] = extractvalue { i32, float } [[TMP31]], 1 ; CHECK-NEXT: [[COND:%.*]] = fcmp olt float [[RES1]], 5.000000e+00 ; CHECK-NEXT: br i1 [[COND]], label [[LOOP_FROM_AFTERCOROSUSPEND:%.*]], label [[END:%.*]] ; CHECK: loop.from.AfterCoroSuspend: ; CHECK-NEXT: [[INC_LOOP:%.*]] = phi i32 [ [[INC]], [[ENTRYRESUME_0:%.*]] ] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP7]], 12 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP7]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP11]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP12]], i32 0 ; CHECK-NEXT: store i32 [[INC_LOOP]], ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP7]], 12 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP15]], i32 0 ; CHECK-NEXT: [[CR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP16]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[CR_RELOAD]] to ptr ; CHECK-NEXT: [[TMP6:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @test.resume.0) ; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_RELOAD]], i32 2, i32 [[TMP28]], i32 [[TMP6]], i32 [[INC_LOOP]]), !continuation.returnedRegistercount [[META4]] +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR_RELOAD]], i32 2, i32 [[TMP28]], i32 poison, i32 [[TMP6]], i32 [[INC_LOOP]]), !continuation.returnedRegistercount [[META4]] ; CHECK-NEXT: unreachable ; CHECK: end: +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP7]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP7]], 8 -; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP17]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP29]], i32 0 -; CHECK-NEXT: [[ARG2_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP19]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP7]], 4 +; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP19]], align 4 ; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP20]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP21]], i32 0 ; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP22]], align 4 -; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP23]], i32 0 -; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP24]], align 4 +; CHECK-NEXT: [[ARG2_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP24]], align 4 ; CHECK-NEXT: [[T2:%.*]] = fmul float [[RES1]], [[ARG_RELOAD]] ; CHECK-NEXT: [[RETURNVALUE:%.*]] = fadd float [[T2]], [[ARG2_RELOAD]] ; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP25]], -20 ; CHECK-NEXT: store i32 [[TMP26]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP27]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP27]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/call-shader-i1-payload.ll b/llvmraytracing/test/lgccps/call-shader-i1-payload.ll index 153909409a..6487fbd1a4 100644 --- a/llvmraytracing/test/lgccps/call-shader-i1-payload.ll +++ b/llvmraytracing/test/lgccps/call-shader-i1-payload.ll @@ -12,22 +12,22 @@ %struct.AnyHitTraversalData = type { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } ; Need _cont_ReportHit to get anyhit traversal system data type -declare !pointeetys !8 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) +declare !pointeetys !8 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) ; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitShader(i32, %struct.DispatchSystemData) #0 +declare %struct.DispatchSystemData @_AmdAwaitShader(i32, i32, i32, %struct.DispatchSystemData) #0 declare !pointeetys !1 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) ; Function Attrs: alwaysinline -define i32 @_cont_GetLocalRootIndex(ptr %data) #0 !pointeetys !1 { - ret i32 5 +define void @_cont_ExitRayGen(ptr %data) #0 !pointeetys !1 { + ret void } ; Function Attrs: alwaysinline define void @_cont_CallShader(ptr %data, i32 %0) #0 !pointeetys !2 { %dis_data = load %struct.DispatchSystemData, ptr %data, align 4 - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i32 2, %struct.DispatchSystemData %dis_data) + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i32 2, i32 3, i32 poison, %struct.DispatchSystemData %dis_data) store %struct.DispatchSystemData %newdata, ptr %data, align 4 ret void } @@ -55,13 +55,8 @@ attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem: !7 = 
!{i32 0, %struct.AnyHitTraversalData poison} !8 = !{!"function", i1 poison, !7, float poison, i32 poison} -; LOWER-RAYTRACING-PIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex( -; LOWER-RAYTRACING-PIPELINE-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] !pointeetys [[META3:![0-9]+]] { -; LOWER-RAYTRACING-PIPELINE-NEXT: ret i32 5 -; -; ; LOWER-RAYTRACING-PIPELINE-LABEL: define void @called( -; LOWER-RAYTRACING-PIPELINE-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [2 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META4:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { +; LOWER-RAYTRACING-PIPELINE-SAME: i32 [[SHADERINDEX:%.*]], i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [2 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META4:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { ; LOWER-RAYTRACING-PIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWER-RAYTRACING-PIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [2 x i32], align 4 ; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 @@ -78,16 +73,16 @@ attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem: ; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 ; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 ; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP8]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 -; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr 
[[TMP7]], i32 1 -; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 ; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP12:%.*]] = load [2 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP13:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa8i32a2i32s(i32 2, i32 4, i32 5, [9 x i32] poison, [2 x i32] [[TMP12]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP14:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP13]], 2 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP13:%.*]] = call { i32, i32, [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } (...) 
@lgc.cps.await__sl_i32i32s_struct.DispatchSystemDatasa8i32a2i32s(i32 2, i32 4, i32 3, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [8 x i32] poison, [2 x i32] [[TMP12]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP14:%.*]] = extractvalue { i32, i32, [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP13]], 4 ; LOWER-RAYTRACING-PIPELINE-NEXT: store [2 x i32] [[TMP14]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP29:%.*]] = freeze [[STRUCT_MYPARAMS]] poison -; LOWER-RAYTRACING-PIPELINE-NEXT: store [[STRUCT_MYPARAMS]] [[TMP29]], ptr [[TMP1]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP23:%.*]] = freeze [[STRUCT_MYPARAMS]] poison +; LOWER-RAYTRACING-PIPELINE-NEXT: store [[STRUCT_MYPARAMS]] [[TMP23]], ptr [[TMP1]], align 4 ; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 ; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 @@ -95,7 +90,7 @@ attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem: ; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1 ; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP18]], align 4 -; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP15:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP13]], 0 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP15:%.*]] = extractvalue { i32, i32, [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP13]], 2 ; LOWER-RAYTRACING-PIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP15]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP21:%.*]] = 
getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 ; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 @@ -110,13 +105,8 @@ attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem: ; LOWER-RAYTRACING-PIPELINE-NEXT: unreachable ; ; -; SROA-LABEL: define i32 @_cont_GetLocalRootIndex( -; SROA-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] !pointeetys [[META3:![0-9]+]] { -; SROA-NEXT: ret i32 5 -; -; ; SROA-LABEL: define void @called( -; SROA-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [2 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META4:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { +; SROA-SAME: i32 [[SHADERINDEX:%.*]], i32 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [2 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META4:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { ; SROA-NEXT: [[DOTSROA_5:%.*]] = alloca i8, align 4 ; SROA-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i32] [[PAYLOAD]], 0 ; SROA-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i32] [[PAYLOAD]], 1 @@ -126,34 +116,9 @@ attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem: ; SROA-NEXT: [[SYSTEM_DATA_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 ; SROA-NEXT: store i8 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_EXTRACT_TRUNC]], ptr [[DOTSROA_5]], align 4 ; SROA-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[SYSTEM_DATA_FCA_0_EXTRACT]], 0 -; SROA-NEXT: [[DOTSROA_5_0__SROA_5_4_2:%.*]] = load i8, ptr [[DOTSROA_5]], align 4 -; SROA-NEXT: [[DOTFCA_0_INSERT5:%.*]] = insertvalue [2 x i32] poison, i32 
[[PAYLOAD_FCA_0_EXTRACT]], 0 -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_EXT19:%.*]] = zext i24 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_TRUNC]] to i32 -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_SHIFT20:%.*]] = shl i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_EXT19]], 8 -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_MASK21:%.*]] = and i32 undef, 255 -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_INSERT22:%.*]] = or i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_MASK21]], [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_SHIFT20]] -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_EXT15:%.*]] = zext i8 [[DOTSROA_5_0__SROA_5_4_2]] to i32 -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_MASK16:%.*]] = and i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_INSERT22]], -256 -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_INSERT17:%.*]] = or i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_MASK16]], [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_EXT15]] -; SROA-NEXT: [[DOTFCA_1_INSERT8:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT5]], i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_INSERT17]], 1 -; SROA-NEXT: [[TMP1:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa8i32a2i32s(i32 2, i32 4, i32 5, [9 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT8]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] -; SROA-NEXT: [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP1]], 2 -; SROA-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x i32] [[TMP2]], 0 -; SROA-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x i32] [[TMP2]], 1 -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_EXTRACT_TRUNC18:%.*]] = trunc i32 [[DOTFCA_1_EXTRACT]] to i8 -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_SHIFT23:%.*]] = lshr i32 [[DOTFCA_1_EXTRACT]], 8 -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_TRUNC24:%.*]] = trunc i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_SHIFT23]] to i24 -; SROA-NEXT: [[TMP4:%.*]] = freeze [[STRUCT_MYPARAMS:%.*]] poison -; SROA-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP4]], 0 -; SROA-NEXT: [[DOTFCA_1_EXTRACT1:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP4]], 1 -; SROA-NEXT: store i1 [[DOTFCA_1_EXTRACT1]], ptr [[DOTSROA_5]], align 4 -; SROA-NEXT: store i8 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_EXTRACT_TRUNC18]], ptr [[DOTSROA_5]], align 4 -; SROA-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP1]], 0 -; SROA-NEXT: [[DOTFCA_0_EXTRACT27:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP3]], 0 ; SROA-NEXT: [[DOTSROA_5_0__SROA_5_4_:%.*]] = load i8, ptr [[DOTSROA_5]], align 4 -; SROA-NEXT: [[DOTFCA_0_INSERT38:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT27]], 0 -; SROA-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0 -; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_EXT:%.*]] = zext i24 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_TRUNC24]] to i32 +; SROA-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x 
i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_EXT:%.*]] = zext i24 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_TRUNC]] to i32 ; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_SHIFT:%.*]] = shl i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_EXT]], 8 ; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_MASK:%.*]] = and i32 undef, 255 ; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_INSERT:%.*]] = or i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_MASK]], [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_SHIFT]] @@ -161,6 +126,31 @@ attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem: ; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_MASK:%.*]] = and i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_INSERT]], -256 ; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_INSERT:%.*]] = or i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_MASK]], [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_EXT]] ; SROA-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_INSERT]], 1 -; SROA-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT38]], [8 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META1]] +; SROA-NEXT: [[TMP1:%.*]] = call { i32, i32, [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } (...) 
@lgc.cps.await__sl_i32i32s_struct.DispatchSystemDatasa8i32a2i32s(i32 2, i32 4, i32 3, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [8 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] +; SROA-NEXT: [[TMP2:%.*]] = extractvalue { i32, i32, [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP1]], 4 +; SROA-NEXT: [[DOTFCA_0_EXTRACT11:%.*]] = extractvalue [2 x i32] [[TMP2]], 0 +; SROA-NEXT: [[DOTFCA_1_EXTRACT13:%.*]] = extractvalue [2 x i32] [[TMP2]], 1 +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_EXTRACT_TRUNC25:%.*]] = trunc i32 [[DOTFCA_1_EXTRACT13]] to i8 +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_SHIFT29:%.*]] = lshr i32 [[DOTFCA_1_EXTRACT13]], 8 +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_TRUNC30:%.*]] = trunc i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_SHIFT29]] to i24 +; SROA-NEXT: [[TMP4:%.*]] = freeze [[STRUCT_MYPARAMS:%.*]] poison +; SROA-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP4]], 0 +; SROA-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP4]], 1 +; SROA-NEXT: store i1 [[DOTFCA_1_EXTRACT]], ptr [[DOTSROA_5]], align 4 +; SROA-NEXT: store i8 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_EXTRACT_TRUNC25]], ptr [[DOTSROA_5]], align 4 +; SROA-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32, [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP1]], 2 +; SROA-NEXT: [[DOTFCA_0_EXTRACT27:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP3]], 0 +; SROA-NEXT: [[DOTSROA_5_0__SROA_5_4_9:%.*]] = load i8, ptr [[DOTSROA_5]], align 4 +; SROA-NEXT: [[DOTFCA_0_INSERT26:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT27]], 0 +; SROA-NEXT: [[DOTFCA_0_INSERT17:%.*]] = insertvalue [2 x i32] poison, i32 [[DOTFCA_0_EXTRACT11]], 0 +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_EXT31:%.*]] = zext i24 
[[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_TRUNC30]] to i32 +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_SHIFT32:%.*]] = shl i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_EXT31]], 8 +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_MASK33:%.*]] = and i32 undef, 255 +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_INSERT34:%.*]] = or i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_MASK33]], [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_SHIFT32]] +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_EXT26:%.*]] = zext i8 [[DOTSROA_5_0__SROA_5_4_9]] to i32 +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_MASK27:%.*]] = and i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_INSERT34]], -256 +; SROA-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_INSERT28:%.*]] = or i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_MASK27]], [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_EXT26]] +; SROA-NEXT: [[DOTFCA_1_INSERT20:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT17]], i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_INSERT28]], 1 +; SROA-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT26]], [8 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT20]]), !continuation.registercount [[META1]] ; SROA-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/cleanup-store-loads.ll b/llvmraytracing/test/lgccps/cleanup-store-loads.ll index 6f7ee9702e..3d185f2fed 100644 --- a/llvmraytracing/test/lgccps/cleanup-store-loads.ll +++ b/llvmraytracing/test/lgccps/cleanup-store-loads.ll @@ -65,7 +65,7 @@ define void @loadAtOffsetF32(ptr %data, i32 %offset) #4 { %test.Frame = type { i32, float, [100 x i32] } -define { ptr, ptr } @test({} %state, i32 %rcr, float %arg, ptr %0) !lgc.cps !0 !continuation !1 { +define { ptr, ptr } @test(i32 %shaderIndex, i32 %rcr, float %arg, ptr %0) !lgc.cps !0 !continuation !1 { entry: %1 = call ptr @continuation.malloc(i32 408) store ptr %1, ptr %0, align 8 @@ -115,7 +115,7 @@ entry: bb1: ; preds = %entry %2 = inttoptr i32 %cr to ptr - %3 = call ptr %2(i32 %cr, i32 2, float %arg), !continuation.returnedRegistercount !{i32 0} + %3 = call ptr %2(i32 %cr, i32 3, i32 2, float %arg), !continuation.returnedRegistercount !{i32 0} %4 = insertvalue { ptr, ptr } undef, ptr @test.resume.0, 0 %5 = insertvalue { ptr, ptr } %4, ptr %3, 1 ret { ptr, ptr } %5 @@ -159,21 +159,21 @@ bb2: ; preds = %entry ; Multiple loads can be optimized away call void @loadAtOffsetI32(ptr %data, i32 48) - call void (...) @lgc.cps.jump(i32 %rcr.reload, i32 2, i32 poison, i32 poison, float %returnvalue) + call void (...) 
@lgc.cps.jump(i32 %rcr.reload, i32 2, i32 poison, i32 poison, i32 poison, float %returnvalue) unreachable } define internal { ptr, ptr } @test.resume.0(ptr noalias noundef nonnull align 4 dereferenceable(8) %0, i1 %1) !lgc.cps !0 !continuation !1 { entryresume.0: %2 = load ptr, ptr %0, align 8 - %3 = call { float } @lgc.ilcps.getReturnValue__f32() - %res = extractvalue { float } %3, 0 + %3 = call { i32, float } @lgc.ilcps.getReturnValue__i32_f32() + %ret.arg = extractvalue { i32, float } %3, 1 %arg.reload.addr = getelementptr inbounds %test.Frame, ptr %2, i32 0, i32 1 %arg.reload = load float, ptr %arg.reload.addr, align 4 %rcr.reload.addr = getelementptr inbounds %test.Frame, ptr %2, i32 0, i32 0 %rcr.reload = load i32, ptr %rcr.reload.addr, align 4 - %returnvalue = fmul float %res, %arg.reload - call void (...) @lgc.cps.jump(i32 %rcr.reload, i32 2, i32 poison, i32 poison, float %returnvalue) + %returnvalue = fmul float %ret.arg, %arg.reload + call void (...) @lgc.cps.jump(i32 %rcr.reload, i32 2, i32 poison, i32 poison, i32 poison, float %returnvalue) unreachable } @@ -198,7 +198,7 @@ declare ptr @llvm.coro.begin(token, ptr writeonly) #1 declare i1 @llvm.coro.suspend.retcon.i1(...) #1 ; Function Attrs: nounwind willreturn -declare { float } @lgc.ilcps.getReturnValue__f32() #2 +declare { i32, float } @lgc.ilcps.getReturnValue__i32_f32() #2 ; Function Attrs: noreturn declare void @continuation.return(...) 
#3 @@ -263,7 +263,7 @@ attributes #4 = { alwaysinline } ; ; ; CHECK-LABEL: define void @test( -; CHECK-SAME: i32 [[CSPINIT:%.*]], {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP51:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[TMP51]], align 4 @@ -355,7 +355,7 @@ attributes #4 = { alwaysinline } ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @test.resume.0) ; CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP52]], i32 [[TMP1]], float [[ARG]]), !continuation.returnedRegistercount [[META4:![0-9]+]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 3, i32 [[TMP52]], i32 2, i32 [[TMP1]], float [[ARG]]), !continuation.returnedRegistercount [[META4:![0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: bb2: ; CHECK-NEXT: [[T0_BB2:%.*]] = phi float [ [[T0]], [[ENTRY:%.*]] ] @@ -403,19 +403,20 @@ attributes #4 = { alwaysinline } ; CHECK-NEXT: [[TMP74:%.*]] = add i32 [[TMP76]], -408 ; CHECK-NEXT: store i32 [[TMP74]], ptr [[TMP51]], align 4 ; CHECK-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP51]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, i32 [[TMP77]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RCR]], i32 2, i32 [[TMP77]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @test.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], float [[TMP2:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], float [[TMP1:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -408 -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { float } poison, float [[TMP2]], 0 -; CHECK-NEXT: [[RES:%.*]] = extractvalue { float } [[TMP13]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { i32, float } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { i32, float } [[TMP13]], float [[TMP1]], 1 +; CHECK-NEXT: [[RET_ARG1:%.*]] = extractvalue { i32, float } [[TMP14]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP6]], i32 0 @@ -423,11 +424,11 @@ attributes #4 = { alwaysinline } ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 ; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 -; CHECK-NEXT: [[RETURNVALUE:%.*]] = fmul float [[RES]], [[ARG_RELOAD]] +; CHECK-NEXT: [[RETURNVALUE:%.*]] = fmul float [[RET_ARG1]], [[ARG_RELOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -408 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr 
[[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/cps-no-await.ll b/llvmraytracing/test/lgccps/cps-no-await.ll index e1862dfb4a..566a797650 100644 --- a/llvmraytracing/test/lgccps/cps-no-await.ll +++ b/llvmraytracing/test/lgccps/cps-no-await.ll @@ -5,7 +5,7 @@ define void @_cont_Traversal() !lgc.cps !{i32 2} !continuation !{ptr @_cont_Trav %pushconst = call ptr addrspace(4) @lgc.user.data(i32 32) %fn = load ptr, ptr addrspace(4) %pushconst %cr = ptrtoint ptr %fn to i32 - call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison) + call void (...) @lgc.cps.jump(i32 %cr, i32 2, i32 poison) unreachable } @@ -16,7 +16,7 @@ declare void @lgc.cps.jump(...) ; LOWER-AWAIT-LABEL: define { ptr, ptr } @_cont_Traversal( ; LOWER-AWAIT-SAME: ptr [[TMP0:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] { ; LOWER-AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype._cont_Traversal, ptr @continuation.malloc, ptr @continuation.free) -; LOWER-AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) +; LOWER-AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin.custom.abi(token [[TMP2]], ptr null, i32 0) ; LOWER-AWAIT-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 32) ; LOWER-AWAIT-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 ; LOWER-AWAIT-NEXT: [[CR:%.*]] = ptrtoint ptr [[FN]] to i32 diff --git a/llvmraytracing/test/lgccps/entry-point-with-cps.ll b/llvmraytracing/test/lgccps/entry-point-with-cps.ll index 9648ddfc11..d2fa22cd3e 100644 --- a/llvmraytracing/test/lgccps/entry-point-with-cps.ll +++ b/llvmraytracing/test/lgccps/entry-point-with-cps.ll @@ -15,7 
+15,7 @@ define void @_cont_KernelEntry() #0 !lgc.rt.shaderstage !{i32 7} { unreachable } -define spir_func void @raygen({} %state, i32 %rcr) !lgc.shaderstage !{i32 7} !lgc.cps !{i32 0} { +define spir_func void @raygen(i32 %levels, i32 %shaderIndex, i32 %rcr) !lgc.shaderstage !{i32 7} !lgc.cps !{i32 0} { %pushconst = call ptr addrspace(4) @lgc.user.data(i32 0) %fn = load ptr, ptr addrspace(4) %pushconst %p8 = getelementptr i8, ptr addrspace(4) %pushconst, i32 8 @@ -25,26 +25,24 @@ define spir_func void @raygen({} %state, i32 %rcr) !lgc.shaderstage !{i32 7} !lg %cr.0 = ptrtoint ptr %fn to i32 %cr.1 = or i32 %cr.0, 2 - %r = call { [2 x i32] } (...) @lgc.cps.await__a2i32(i32 %cr.1, i32 4, i32 %x, ptr addrspace(1) %dst), !continuation.returnedRegistercount !{i32 0} - %res = extractvalue { [2 x i32] } %r, 0 - - store [2 x i32] %res, ptr addrspace(1) %dst + %r = call { i32, [2 x i32] } (...) @lgc.cps.await__i32_a2i32(i32 %cr.1, i32 4, i32 %x, ptr addrspace(1) %dst), !continuation.returnedRegistercount !{i32 0} + %r.ret = extractvalue { i32, [2 x i32] } %r, 1 + store [2 x i32] %r.ret, ptr addrspace(1) %dst ; Note: RGS returns, meaning end of thread. call void @lgc.cps.complete() unreachable } -define spir_func void @chs({} %state, i32 %rcr, i32 %x) !lgc.shaderstage !{i32 7} !lgc.cps !{i32 1} { +define spir_func void @chs(i32 %rcr, i32 %x) !lgc.shaderstage !{i32 7} !lgc.cps !{i32 1} { %pushconst = call ptr addrspace(4) @lgc.user.data(i32 24) %fn = load ptr, ptr addrspace(4) %pushconst %cr.0 = ptrtoint ptr %fn to i32 %cr.1 = or i32 %cr.0, 1 %y = call { i32 } (...) @lgc.cps.await__i32(i32 %cr.1, i32 2, i32 %x), !continuation.returnedRegistercount !{i32 0} - %res = extractvalue { i32 } %y, 0 - - call void (...) @lgc.cps.jump(i32 %rcr, i32 5, i32 %res, i32 poison, i32 poison) + %y.ret = extractvalue { i32 } %y, 0 + call void (...) 
@lgc.cps.jump(i32 %rcr, i32 5, i32 poison, i32 poison, i32 poison, i32 %y.ret) unreachable } @@ -60,7 +58,7 @@ main: %fn = load ptr, ptr addrspace(4) %pushconst %cr.0 = ptrtoint ptr %fn to i32 - call void (...) @lgc.cps.jump(i32 %cr.0, i32 1, i32 5, i32 poison, i32 poison) + call void (...) @lgc.cps.jump(i32 %cr.0, i32 1, i32 poison, i32 5, i32 poison, i32 poison) br label %exit @@ -74,7 +72,7 @@ declare ptr addrspace(4) @lgc.user.data(i32) declare <3 x i32> @lgc.shader.input.LocalInvocationId(i32) declare void @lgc.cps.await__isVoid(...) declare { i32 } @lgc.cps.await__i32(...) -declare { [2 x i32] } @lgc.cps.await__a2i32(...) +declare { i32, [2 x i32] } @lgc.cps.await__i32_a2i32(...) declare void @lgc.cps.jump(...) !continuation.stackAddrspace = !{!0} @@ -88,7 +86,7 @@ declare void @lgc.cps.jump(...) ; ; ; CHECK-LABEL: define spir_func void @raygen( -; CHECK-SAME: i32 [[CSPINIT:%.*]], {} [[STATE:%.*]], i32 [[RCR:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.state [[META2]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[LEVELS:%.*]], i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.state [[META2]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -103,26 +101,27 @@ declare void @lgc.cps.jump(...) ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CR_1]] to ptr ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @raygen.resume.0) ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_1]], i32 4, i32 [[TMP2]], i32 [[TMP1]], i32 [[X]], ptr addrspace(1) [[DST]]), !continuation.returnedRegistercount [[META2]] +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR_1]], i32 4, i32 [[TMP2]], i32 [[X]], i32 [[TMP1]], ptr addrspace(1) [[DST]]), !continuation.returnedRegistercount [[META2]] ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @raygen.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], [2 x i32] [[TMP2:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META2]] !continuation [[META3]] !continuation.registercount [[META2]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], [2 x i32] [[TMP1:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META2]] !continuation [[META3]] !continuation.registercount [[META2]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { [2 x i32] } poison, [2 x i32] [[TMP2]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { i32, [2 x i32] } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertvalue { i32, [2 x i32] } [[TMP2]], [2 x i32] [[TMP1]], 1 ; CHECK-NEXT: [[PUSHCONST3:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 0) ; CHECK-NEXT: [[P162:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST3]], i32 16 ; CHECK-NEXT: [[DST1:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[P162]], align 8 -; CHECK-NEXT: [[RES1:%.*]] = extractvalue { [2 x i32] } [[TMP3]], 0 -; CHECK-NEXT: store [2 x i32] [[RES1]], ptr addrspace(1) [[DST1]], align 4 +; CHECK-NEXT: [[R_RET1:%.*]] = extractvalue { i32, [2 x i32] } [[TMP4]], 1 +; CHECK-NEXT: store [2 x i32] [[R_RET1]], ptr addrspace(1) [[DST1]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define spir_func void @chs( -; CHECK-SAME: i32 [[CSPINIT:%.*]], {} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[X:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.stacksize [[META6:![0-9]+]] !continuation.state [[META6]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[RCR:%.*]], i32 [[X:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps 
[[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.stacksize [[META6:![0-9]+]] !continuation.state [[META6]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -139,18 +138,18 @@ declare void @lgc.cps.jump(...) ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CR_1]] to ptr ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @chs.resume.0) ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_1]], i32 2, i32 [[TMP6]], i32 [[TMP1]], i32 [[X]]), !continuation.returnedRegistercount [[META2]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_1]], i32 2, i32 [[TMP6]], i32 [[X]], i32 [[TMP1]]), !continuation.returnedRegistercount [[META2]] ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @chs.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META4]] !continuation [[META5]] !continuation.registercount [[META2]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META4]] !continuation [[META5]] !continuation.registercount [[META2]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -8 -; CHECK-NEXT: [[TMP10:%.*]] = insertvalue { i32 } poison, i32 [[TMP2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertvalue { i32 } poison, i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 0 ; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP6]], align 4 @@ -159,7 +158,7 @@ declare void @lgc.cps.jump(...) 
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], -8 ; CHECK-NEXT: store i32 [[TMP8]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 5, i32 [[TMP9]], i32 poison, i32 poison) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 5, i32 [[TMP9]], i32 poison, i32 poison, i32 [[RES]]) ; CHECK-NEXT: unreachable ; ; @@ -176,7 +175,7 @@ declare void @lgc.cps.jump(...) ; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 ; CHECK-NEXT: [[CR_0:%.*]] = ptrtoint ptr [[FN]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_0]], i32 1, i32 [[TMP3]], i32 poison, i32 poison) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_0]], i32 1, i32 [[TMP3]], i32 5, i32 poison, i32 poison) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -189,9 +188,9 @@ declare void @lgc.cps.jump(...) 
; ; ; LOWER-AWAIT-LABEL: define spir_func { ptr, ptr } @raygen( -; LOWER-AWAIT-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], ptr [[TMP0:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] { +; LOWER-AWAIT-SAME: i32 [[LEVELS:%.*]], i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]], ptr [[TMP0:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] { ; LOWER-AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.raygen, ptr @continuation.malloc, ptr @continuation.free) -; LOWER-AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) +; LOWER-AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin.custom.abi(token [[TMP2]], ptr null, i32 0) ; LOWER-AWAIT-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 0) ; LOWER-AWAIT-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 ; LOWER-AWAIT-NEXT: [[P8:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST]], i32 8 @@ -203,17 +202,17 @@ declare void @lgc.cps.jump(...) ; LOWER-AWAIT-NEXT: [[TMP4:%.*]] = inttoptr i32 [[CR_1]] to ptr ; LOWER-AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR_1]], i32 4, i32 [[X]], ptr addrspace(1) [[DST]]), !continuation.returnedRegistercount [[META2]] ; LOWER-AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) 
@llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) -; LOWER-AWAIT-NEXT: [[TMP8:%.*]] = call { [2 x i32] } @lgc.ilcps.getReturnValue__sl_a2i32s() -; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = extractvalue { [2 x i32] } [[TMP8]], 0 +; LOWER-AWAIT-NEXT: [[TMP8:%.*]] = call { i32, [2 x i32] } @lgc.ilcps.getReturnValue__sl_i32a2i32s() +; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = extractvalue { i32, [2 x i32] } [[TMP8]], 1 ; LOWER-AWAIT-NEXT: store [2 x i32] [[TMP7]], ptr addrspace(1) [[DST]], align 4 ; LOWER-AWAIT-NEXT: call void @lgc.cps.complete() ; LOWER-AWAIT-NEXT: unreachable ; ; ; LOWER-AWAIT-LABEL: define spir_func { ptr, ptr } @chs( -; LOWER-AWAIT-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[X:%.*]], ptr [[TMP0:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] { +; LOWER-AWAIT-SAME: i32 [[RCR:%.*]], i32 [[X:%.*]], ptr [[TMP0:%.*]]) !lgc.shaderstage [[META1]] !lgc.cps [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] { ; LOWER-AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.chs, ptr @continuation.malloc, ptr @continuation.free) -; LOWER-AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) +; LOWER-AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin.custom.abi(token [[TMP2]], ptr null, i32 0) ; LOWER-AWAIT-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 24) ; LOWER-AWAIT-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 ; LOWER-AWAIT-NEXT: [[CR_0:%.*]] = ptrtoint ptr [[FN]] to i32 @@ -223,7 +222,7 @@ declare void @lgc.cps.jump(...) ; LOWER-AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) ; LOWER-AWAIT-NEXT: [[TMP8:%.*]] = call { i32 } @lgc.ilcps.getReturnValue__sl_i32s() ; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = extractvalue { i32 } [[TMP8]], 0 -; LOWER-AWAIT-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR]], i32 5, i32 [[TMP7]], i32 poison, i32 poison) +; LOWER-AWAIT-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RCR]], i32 5, i32 poison, i32 poison, i32 poison, i32 [[TMP7]]) ; LOWER-AWAIT-NEXT: unreachable ; ; @@ -238,7 +237,7 @@ declare void @lgc.cps.jump(...) ; LOWER-AWAIT-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 32) ; LOWER-AWAIT-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 ; LOWER-AWAIT-NEXT: [[CR_0:%.*]] = ptrtoint ptr [[FN]] to i32 -; LOWER-AWAIT-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_0]], i32 1, i32 5, i32 poison, i32 poison) +; LOWER-AWAIT-NEXT: call void (...) @lgc.cps.jump(i32 [[CR_0]], i32 1, i32 poison, i32 5, i32 poison, i32 poison) ; LOWER-AWAIT-NEXT: br label [[EXIT]] ; LOWER-AWAIT: exit: ; LOWER-AWAIT-NEXT: call void @lgc.cps.complete() diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll index e8fbc941c2..7fceb15542 100644 --- a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll +++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll @@ -5,10 +5,10 @@ %struct.DispatchSystemData = type { i32 } ; Need _cont_ReportHit to get system data type +declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) declare !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) -declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) +declare !pointeetys !10 void @_cont_ExitRayGen(%struct.DispatchSystemData*) declare i32 @_AmdContPayloadRegistersGetI32(i32) @@ -16,7 +16,7 @@ declare i32 @_AmdContPayloadRegistersGetI32(i32) define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !pointeetys !1 !lgc.rt.shaderstage !3 { ; CHECK-LABEL: define dso_local spir_func void 
@_cont_Traversal( -; CHECK-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { i32 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META3:![0-9]+]] !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !lgc.cps [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] { +; CHECK-SAME: i32 [[SHADERINDEX:%.*]], i32 [[RETURNADDR:%.*]], { { i32 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META3:![0-9]+]] !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !lgc.cps [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { i32 } }, align 8, addrspace(5) ; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll index 603e9670e5..75cc839391 100644 --- a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll +++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll @@ -5,10 +5,10 @@ %struct.DispatchSystemData = type { i32 } ; Need _cont_ReportHit to get system data type +declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) declare !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) -declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) +declare !pointeetys !10 void @_cont_ExitRayGen(%struct.DispatchSystemData*) declare i32 @_AmdContPayloadRegistersI32Count() @@ -16,7 +16,7 @@ declare i32 @_AmdContPayloadRegistersI32Count() define dso_local 
spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !pointeetys !1 !lgc.rt.shaderstage !3 { ; CHECK-LABEL: define dso_local spir_func void @_cont_Traversal( -; CHECK-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { i32 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x i32] [[PADDING:%.*]], [11 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META4:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.registercount [[META1:![0-9]+]] !lgc.cps [[META6:![0-9]+]] !continuation [[META7:![0-9]+]] { +; CHECK-SAME: i32 [[SHADERINDEX:%.*]], i32 [[RETURNADDR:%.*]], { { i32 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x i32] [[PADDING:%.*]], [11 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META4:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.registercount [[META1:![0-9]+]] !lgc.cps [[META6:![0-9]+]] !continuation [[META7:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { i32 } }, align 8, addrspace(5) ; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [11 x i32], align 4 diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll index cdef1d2c52..48db4658b1 100644 --- a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll +++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll @@ -5,16 +5,16 @@ %struct.DispatchSystemData = type { i32 } ; Need _cont_ReportHit to get system data type +declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) declare !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) -declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !10 i32 
@_cont_GetLocalRootIndex(%struct.DispatchSystemData*) +declare !pointeetys !10 void @_cont_ExitRayGen(%struct.DispatchSystemData*) declare void @_AmdContPayloadRegistersSetI32(i32, i32) define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !pointeetys !1 !lgc.rt.shaderstage !3 { ; CHECK-LABEL: define dso_local spir_func void @_cont_Traversal( -; CHECK-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { i32 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META3:![0-9]+]] !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !lgc.cps [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] { +; CHECK-SAME: i32 [[SHADERINDEX:%.*]], i32 [[RETURNADDR:%.*]], { { i32 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META3:![0-9]+]] !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !lgc.cps [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { i32 } }, align 8, addrspace(5) ; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 diff --git a/llvmraytracing/test/lgccps/lower-traversal.ll b/llvmraytracing/test/lgccps/lower-traversal.ll deleted file mode 100644 index ac79fb5aed..0000000000 --- a/llvmraytracing/test/lgccps/lower-traversal.ll +++ /dev/null @@ -1,622 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; We run this test file twice with different max hit attribute sizes to test that e.g. padding depends correctly on the max hit attribute size. 
-; RUN: grep -v HITATTR_SIZE_8 %s | opt --verify-each -passes='lower-raytracing-pipeline,lint' -S --lint-abort-on-error | FileCheck -check-prefix=CHECK-ATTRSIZE-16 %s -; RUN: grep -v HITATTR_SIZE_16 %s | opt --verify-each -passes='lower-raytracing-pipeline,lint' -S --lint-abort-on-error | FileCheck -check-prefix=CHECK-ATTRSIZE-8 %s - -%struct.AnyHitTraversalData = type { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } -%struct.DispatchSystemData = type { i32 } - -; Need _cont_ReportHit to get system data type -declare !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) - -declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) -declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) - -declare i32 @_AmdGetCurrentFuncAddr() - -define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !pointeetys !1 !lgc.rt.shaderstage !3 { -; CHECK-ATTRSIZE-16-LABEL: define dso_local spir_func void @_cont_Traversal( -; CHECK-ATTRSIZE-16-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [7 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META5:![0-9]+]] !lgc.rt.shaderstage [[META6:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !lgc.cps [[META7:![0-9]+]] !continuation [[META8:![0-9]+]] { -; CHECK-ATTRSIZE-16-NEXT: .entry: -; CHECK-ATTRSIZE-16-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, 
i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, align 16, addrspace(5) -; CHECK-ATTRSIZE-16-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [8 x i32], align 4 -; CHECK-ATTRSIZE-16-NEXT: store [8 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-ATTRSIZE-16-NEXT: store { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 -; CHECK-ATTRSIZE-16-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0) -; CHECK-ATTRSIZE-16-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) -; CHECK-ATTRSIZE-16-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 -; CHECK-ATTRSIZE-16-NEXT: [[TMP3:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-ATTRSIZE-16-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP5:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 -; CHECK-ATTRSIZE-16-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(5) [[TMP5]], align 8 -; CHECK-ATTRSIZE-16-NEXT: [[TMP7:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 -; CHECK-ATTRSIZE-16-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 -; 
CHECK-ATTRSIZE-16-NEXT: [[TMP9:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 2 -; CHECK-ATTRSIZE-16-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFR539:%.*]] = freeze i32 [[TMP10]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP11:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 -; CHECK-ATTRSIZE-16-NEXT: [[TMP12:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP11]], align 16 -; CHECK-ATTRSIZE-16-NEXT: [[TMP13:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP14:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP13]], align 16 -; CHECK-ATTRSIZE-16-NEXT: [[TMP15:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 5 -; CHECK-ATTRSIZE-16-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[TMP15]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP17:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 6 -; CHECK-ATTRSIZE-16-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[TMP17]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP19:%.*]] = 
getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 0 -; CHECK-ATTRSIZE-16-NEXT: [[TMP20:%.*]] = load float, ptr addrspace(5) [[TMP19]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP21:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 1 -; CHECK-ATTRSIZE-16-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[TMP21]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFR:%.*]] = freeze i32 [[TMP22]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP23:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 2 -; CHECK-ATTRSIZE-16-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[TMP23]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP25:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 3 -; CHECK-ATTRSIZE-16-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[TMP25]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP27:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[TMP27]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP29:%.*]] = getelementptr { { <3 x i32>, i32 
}, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 1 -; CHECK-ATTRSIZE-16-NEXT: [[TMP30:%.*]] = load <2 x float>, ptr addrspace(5) [[TMP29]], align 8 -; CHECK-ATTRSIZE-16-NEXT: [[TMP31:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 2 -; CHECK-ATTRSIZE-16-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[TMP31]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP33:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 3 -; CHECK-ATTRSIZE-16-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(5) [[TMP33]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP35:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[TMP35]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP37:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 5 -; CHECK-ATTRSIZE-16-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[TMP37]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP39:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, 
i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 6 -; CHECK-ATTRSIZE-16-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(5) [[TMP39]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP41:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 7 -; CHECK-ATTRSIZE-16-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(5) [[TMP41]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP43:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 8 -; CHECK-ATTRSIZE-16-NEXT: [[TMP44:%.*]] = load i64, ptr addrspace(5) [[TMP43]], align 8 -; CHECK-ATTRSIZE-16-NEXT: [[TMP45:%.*]] = icmp ugt i32 [[DOTFR]], -3 -; CHECK-ATTRSIZE-16-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP67:%.*]] -; CHECK-ATTRSIZE-16: 46: -; CHECK-ATTRSIZE-16-NEXT: [[TMP47:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 5 -; CHECK-ATTRSIZE-16-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(7) [[TMP47]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP49:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 6 -; CHECK-ATTRSIZE-16-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(7) [[TMP49]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP51:%.*]] = zext i32 [[TMP50]] to i64 -; CHECK-ATTRSIZE-16-NEXT: [[TMP52:%.*]] = shl nuw i64 [[TMP51]], 32 -; CHECK-ATTRSIZE-16-NEXT: [[TMP53:%.*]] = zext i32 
[[TMP48]] to i64 -; CHECK-ATTRSIZE-16-NEXT: [[TMP54:%.*]] = or i64 [[TMP52]], [[TMP53]] -; CHECK-ATTRSIZE-16-NEXT: [[DOTFR541:%.*]] = freeze i64 [[TMP54]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP55:%.*]] = icmp eq i64 [[DOTFR541]], 0 -; CHECK-ATTRSIZE-16-NEXT: br i1 [[TMP55]], label [[DOTEXIT2:%.*]], label [[TMP56:%.*]] -; CHECK-ATTRSIZE-16: 56: -; CHECK-ATTRSIZE-16-NEXT: [[TMP57:%.*]] = lshr i32 [[TMP8]], 16 -; CHECK-ATTRSIZE-16-NEXT: [[TMP58:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 7 -; CHECK-ATTRSIZE-16-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(7) [[TMP58]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP60:%.*]] = mul i32 [[TMP59]], [[TMP57]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP61:%.*]] = zext i32 [[TMP60]] to i64 -; CHECK-ATTRSIZE-16-NEXT: [[TMP62:%.*]] = add i64 [[DOTFR541]], [[TMP61]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr addrspace(4) -; CHECK-ATTRSIZE-16-NEXT: [[TMP64:%.*]] = load i32, ptr addrspace(4) [[TMP63]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP65:%.*]] = freeze i32 [[TMP64]] -; CHECK-ATTRSIZE-16-NEXT: br label [[DOTEXIT2]] -; CHECK-ATTRSIZE-16: .exit2: -; CHECK-ATTRSIZE-16-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP57]], [[TMP56]] ], [ undef, [[TMP46]] ] -; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0128_0_EXTRACT_TRUNC:%.*]] = phi i32 [ [[TMP65]], [[TMP56]] ], [ 0, [[TMP46]] ] -; CHECK-ATTRSIZE-16-NEXT: [[DOTNOT542:%.*]] = icmp eq i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], 0 -; CHECK-ATTRSIZE-16-NEXT: br i1 [[DOTNOT542]], label [[TMP107:%.*]], label [[TMP66:%.*]] -; CHECK-ATTRSIZE-16: 66: -; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0130_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x 
float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> [[TMP2]], 0, 0 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_0_INSERT]], i32 [[DOT0]], 0, 1 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_1_INSERT]], i64 [[TMP6]], 1, 0 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_0_INSERT]], i32 [[TMP8]], 1, 1 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_1_INSERT]], i32 [[DOTFR539]], 1, 2 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_2_INSERT]], <3 x float> [[TMP12]], 1, 3 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_3_INSERT]], <3 x float> [[TMP14]], 1, 4 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_5_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_4_INSERT]], 
float [[TMP16]], 1, 5 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_5_INSERT]], float [[TMP18]], 1, 6 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_6_INSERT]], float [[TMP20]], 2, 0, 0 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_0_INSERT]], i32 [[DOTFR]], 2, 0, 1 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_1_INSERT]], i32 [[TMP24]], 2, 0, 2 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_2_INSERT]], i32 [[TMP26]], 2, 0, 3 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_3_INSERT]], i32 [[TMP28]], 2, 0, 4 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_4_INSERT]], <2 x float> [[TMP30]], 2, 1 -; 
CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_1_INSERT]], i32 [[TMP32]], 2, 2 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_2_INSERT]], i32 [[TMP34]], 2, 3 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_3_INSERT]], i32 [[TMP36]], 2, 4 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_5_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_4_INSERT]], i32 [[TMP38]], 2, 5 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT]], i32 [[TMP40]], 2, 6 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_7_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT]], i32 [[TMP42]], 2, 7 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_8_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT]], i64 [[TMP44]], 2, 8 -; CHECK-ATTRSIZE-16-NEXT: [[TMP109:%.*]] = load [8 x i32], ptr 
[[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-ATTRSIZE-16-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 poison, i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [7 x i32] poison, [8 x i32] [[TMP109]]), !continuation.registercount [[META0]] -; CHECK-ATTRSIZE-16-NEXT: unreachable -; CHECK-ATTRSIZE-16: 68: -; CHECK-ATTRSIZE-16-NEXT: [[TMP68:%.*]] = shl i32 [[DOTFR]], 3 -; CHECK-ATTRSIZE-16-NEXT: [[TMP69:%.*]] = and i32 [[TMP68]], -64 -; CHECK-ATTRSIZE-16-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i64 -; CHECK-ATTRSIZE-16-NEXT: [[TMP71:%.*]] = add i64 [[TMP6]], [[TMP70]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP72:%.*]] = add i64 [[TMP71]], 48 -; CHECK-ATTRSIZE-16-NEXT: [[TMP73:%.*]] = inttoptr i64 [[TMP72]] to ptr addrspace(1) -; CHECK-ATTRSIZE-16-NEXT: [[TMP74:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP73]], align 16 -; CHECK-ATTRSIZE-16-NEXT: [[DOT4_VEC_EXTRACT452:%.*]] = extractelement <4 x i32> [[TMP74]], i64 1 -; CHECK-ATTRSIZE-16-NEXT: [[TMP75:%.*]] = and i32 [[TMP26]], 16777215 -; CHECK-ATTRSIZE-16-NEXT: [[TMP76:%.*]] = and i32 [[DOT4_VEC_EXTRACT452]], 16777215 -; CHECK-ATTRSIZE-16-NEXT: [[TMP77:%.*]] = lshr i32 [[TMP8]], 8 -; CHECK-ATTRSIZE-16-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 15 -; CHECK-ATTRSIZE-16-NEXT: [[TMP79:%.*]] = lshr i32 [[TMP8]], 12 -; CHECK-ATTRSIZE-16-NEXT: [[TMP80:%.*]] = and i32 [[TMP79]], 15 -; CHECK-ATTRSIZE-16-NEXT: [[TMP81:%.*]] = mul nuw nsw i32 [[TMP80]], [[TMP75]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP82:%.*]] = add nuw nsw i32 [[TMP78]], [[TMP81]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP83:%.*]] = add nuw nsw i32 [[TMP82]], [[TMP76]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP84:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 9 -; CHECK-ATTRSIZE-16-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(7) [[TMP84]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP86:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 10 -; CHECK-ATTRSIZE-16-NEXT: [[TMP87:%.*]] = load i32, ptr addrspace(7) [[TMP86]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP88:%.*]] = zext i32 [[TMP87]] to i64 -; CHECK-ATTRSIZE-16-NEXT: [[TMP89:%.*]] = shl nuw i64 [[TMP88]], 32 -; CHECK-ATTRSIZE-16-NEXT: [[TMP90:%.*]] = zext i32 [[TMP85]] to i64 -; CHECK-ATTRSIZE-16-NEXT: [[TMP91:%.*]] = or i64 [[TMP89]], [[TMP90]] -; CHECK-ATTRSIZE-16-NEXT: [[DOTFR537:%.*]] = freeze i64 [[TMP91]] -; CHECK-ATTRSIZE-16-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DOTFR537]], 0 -; CHECK-ATTRSIZE-16-NEXT: br i1 [[DOTNOT]], label [[DOTEXIT5:%.*]], label [[TMP92:%.*]] -; CHECK-ATTRSIZE-16: 93: -; CHECK-ATTRSIZE-16-NEXT: [[TMP93:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 11 -; CHECK-ATTRSIZE-16-NEXT: [[TMP94:%.*]] = load i32, ptr addrspace(7) [[TMP93]], align 4 -; CHECK-ATTRSIZE-16-NEXT: [[TMP95:%.*]] = mul i32 [[TMP94]], [[TMP83]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP96:%.*]] = zext i32 [[TMP95]] to i64 -; CHECK-ATTRSIZE-16-NEXT: [[TMP97:%.*]] = add i64 [[DOTFR537]], [[TMP96]] -; CHECK-ATTRSIZE-16-NEXT: [[TMP98:%.*]] = inttoptr i64 [[TMP97]] to ptr addrspace(1) -; CHECK-ATTRSIZE-16-NEXT: [[TMP99:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP98]], align 16 -; CHECK-ATTRSIZE-16-NEXT: [[TMP100:%.*]] = shufflevector <4 x i32> [[TMP99]], <4 x i32> poison, <2 x i32> -; CHECK-ATTRSIZE-16-NEXT: [[TMP101:%.*]] = freeze <2 x i32> [[TMP100]] -; 
CHECK-ATTRSIZE-16-NEXT: br label [[DOTEXIT5]] -; CHECK-ATTRSIZE-16: .exit5: -; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0501_0:%.*]] = phi <2 x i32> [ [[TMP101]], [[TMP92]] ], [ zeroinitializer, [[TMP67]] ] -; CHECK-ATTRSIZE-16-NEXT: [[TMP102:%.*]] = and i32 [[DOTFR539]], 8 -; CHECK-ATTRSIZE-16-NEXT: [[TMP103:%.*]] = icmp ne i32 [[TMP102]], 0 -; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0150_0_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[DOTSROA_0501_0]], i64 0 -; CHECK-ATTRSIZE-16-NEXT: [[DOTNOT540:%.*]] = icmp eq i32 [[DOTSROA_0150_0_VEC_EXTRACT]], 0 -; CHECK-ATTRSIZE-16-NEXT: [[OR_COND:%.*]] = or i1 [[TMP103]], [[DOTNOT540]] -; CHECK-ATTRSIZE-16-NEXT: br i1 [[OR_COND]], label [[TMP107]], label [[TMP104:%.*]] -; CHECK-ATTRSIZE-16: 105: -; CHECK-ATTRSIZE-16-NEXT: [[TMP106:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @_cont_Traversal) -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_0_INSERT322:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> [[TMP2]], 0, 0 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_1_INSERT323:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_0_INSERT322]], i32 [[TMP83]], 0, 1 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_0_INSERT324:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_1_INSERT323]], i64 [[TMP6]], 1, 0 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_1_INSERT325:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_0_INSERT324]], i32 [[TMP8]], 1, 1 -; CHECK-ATTRSIZE-16-NEXT: 
[[DOTFCA_1_2_INSERT326:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_1_INSERT325]], i32 [[DOTFR539]], 1, 2 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_3_INSERT327:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_2_INSERT326]], <3 x float> [[TMP12]], 1, 3 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_4_INSERT328:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_3_INSERT327]], <3 x float> [[TMP14]], 1, 4 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_5_INSERT329:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_4_INSERT328]], float [[TMP16]], 1, 5 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_6_INSERT330:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_5_INSERT329]], float [[TMP18]], 1, 6 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_0_INSERT331:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_6_INSERT330]], float [[TMP20]], 2, 0, 0 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_1_INSERT332:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_0_INSERT331]], i32 [[DOTFR]], 2, 0, 1 -; 
CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_2_INSERT333:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_1_INSERT332]], i32 [[TMP24]], 2, 0, 2 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_3_INSERT334:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_2_INSERT333]], i32 [[TMP26]], 2, 0, 3 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_4_INSERT335:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_3_INSERT334]], i32 [[TMP28]], 2, 0, 4 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_1_INSERT336:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_4_INSERT335]], <2 x float> [[TMP30]], 2, 1 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_2_INSERT337:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_1_INSERT336]], i32 [[TMP32]], 2, 2 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_3_INSERT338:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_2_INSERT337]], i32 [[TMP34]], 2, 3 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_4_INSERT339:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_3_INSERT338]], i32 [[TMP36]], 2, 4 -; 
CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_5_INSERT340:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_4_INSERT339]], i32 [[TMP38]], 2, 5 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_6_INSERT341:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT340]], i32 [[TMP40]], 2, 6 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_7_INSERT342:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT341]], i32 [[TMP42]], 2, 7 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_8_INSERT343:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT342]], i64 [[TMP44]], 2, 8 -; CHECK-ATTRSIZE-16-NEXT: [[TMP108:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-ATTRSIZE-16-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, i32 poison, i32 [[TMP106]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [7 x i32] poison, [8 x i32] [[TMP108]]), !continuation.registercount [[META0]] -; CHECK-ATTRSIZE-16-NEXT: unreachable -; CHECK-ATTRSIZE-16: 108: -; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_7_0:%.*]] = phi i32 [ [[TMP4]], [[DOTEXIT2]] ], [ [[TMP83]], [[DOTEXIT5]] ] -; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0373_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } poison, <3 x i32> [[TMP2]], 0 -; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } [[DOTFCA_0_INSERT]], i32 [[DOTSROA_7_0]], 1 -; CHECK-ATTRSIZE-16-NEXT: [[TMP110:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-ATTRSIZE-16-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, i32 poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [34 x i32] poison, [8 x i32] [[TMP110]]), !continuation.registercount [[META0]] -; CHECK-ATTRSIZE-16-NEXT: unreachable -; -; CHECK-ATTRSIZE-8-LABEL: define dso_local spir_func void @_cont_Traversal( -; CHECK-ATTRSIZE-8-SAME: i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [5 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META4:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !lgc.cps [[META6:![0-9]+]] !continuation [[META7:![0-9]+]] { -; CHECK-ATTRSIZE-8-NEXT: .entry: -; CHECK-ATTRSIZE-8-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, align 16, addrspace(5) -; CHECK-ATTRSIZE-8-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [8 x i32], align 4 -; CHECK-ATTRSIZE-8-NEXT: store [8 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-ATTRSIZE-8-NEXT: store { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 -; CHECK-ATTRSIZE-8-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0) -; CHECK-ATTRSIZE-8-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) -; CHECK-ATTRSIZE-8-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 -; CHECK-ATTRSIZE-8-NEXT: [[TMP3:%.*]] = 
getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1 -; CHECK-ATTRSIZE-8-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP5:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 -; CHECK-ATTRSIZE-8-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(5) [[TMP5]], align 8 -; CHECK-ATTRSIZE-8-NEXT: [[TMP7:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 -; CHECK-ATTRSIZE-8-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP9:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 2 -; CHECK-ATTRSIZE-8-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFR539:%.*]] = freeze i32 [[TMP10]] -; CHECK-ATTRSIZE-8-NEXT: [[TMP11:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 -; CHECK-ATTRSIZE-8-NEXT: [[TMP12:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP11]], align 16 -; CHECK-ATTRSIZE-8-NEXT: [[TMP13:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, 
float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP14:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP13]], align 16 -; CHECK-ATTRSIZE-8-NEXT: [[TMP15:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 5 -; CHECK-ATTRSIZE-8-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[TMP15]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP17:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 6 -; CHECK-ATTRSIZE-8-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[TMP17]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP19:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 0 -; CHECK-ATTRSIZE-8-NEXT: [[TMP20:%.*]] = load float, ptr addrspace(5) [[TMP19]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP21:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 1 -; CHECK-ATTRSIZE-8-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[TMP21]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFR:%.*]] = freeze i32 [[TMP22]] -; CHECK-ATTRSIZE-8-NEXT: [[TMP23:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 
x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 2 -; CHECK-ATTRSIZE-8-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[TMP23]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP25:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 3 -; CHECK-ATTRSIZE-8-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[TMP25]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP27:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[TMP27]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP29:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 1 -; CHECK-ATTRSIZE-8-NEXT: [[TMP30:%.*]] = load <2 x float>, ptr addrspace(5) [[TMP29]], align 8 -; CHECK-ATTRSIZE-8-NEXT: [[TMP31:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 2 -; CHECK-ATTRSIZE-8-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[TMP31]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP33:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 3 -; 
CHECK-ATTRSIZE-8-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(5) [[TMP33]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP35:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[TMP35]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP37:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 5 -; CHECK-ATTRSIZE-8-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[TMP37]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP39:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 6 -; CHECK-ATTRSIZE-8-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(5) [[TMP39]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP41:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 7 -; CHECK-ATTRSIZE-8-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(5) [[TMP41]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP43:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 8 -; CHECK-ATTRSIZE-8-NEXT: [[TMP44:%.*]] = load i64, ptr addrspace(5) [[TMP43]], align 8 -; CHECK-ATTRSIZE-8-NEXT: [[TMP45:%.*]] = icmp ugt i32 
[[DOTFR]], -3 -; CHECK-ATTRSIZE-8-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP67:%.*]] -; CHECK-ATTRSIZE-8: 46: -; CHECK-ATTRSIZE-8-NEXT: [[TMP47:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 5 -; CHECK-ATTRSIZE-8-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(7) [[TMP47]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP49:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 6 -; CHECK-ATTRSIZE-8-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(7) [[TMP49]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP51:%.*]] = zext i32 [[TMP50]] to i64 -; CHECK-ATTRSIZE-8-NEXT: [[TMP52:%.*]] = shl nuw i64 [[TMP51]], 32 -; CHECK-ATTRSIZE-8-NEXT: [[TMP53:%.*]] = zext i32 [[TMP48]] to i64 -; CHECK-ATTRSIZE-8-NEXT: [[TMP54:%.*]] = or i64 [[TMP52]], [[TMP53]] -; CHECK-ATTRSIZE-8-NEXT: [[DOTFR541:%.*]] = freeze i64 [[TMP54]] -; CHECK-ATTRSIZE-8-NEXT: [[TMP55:%.*]] = icmp eq i64 [[DOTFR541]], 0 -; CHECK-ATTRSIZE-8-NEXT: br i1 [[TMP55]], label [[DOTEXIT2:%.*]], label [[TMP56:%.*]] -; CHECK-ATTRSIZE-8: 56: -; CHECK-ATTRSIZE-8-NEXT: [[TMP57:%.*]] = lshr i32 [[TMP8]], 16 -; CHECK-ATTRSIZE-8-NEXT: [[TMP58:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 7 -; CHECK-ATTRSIZE-8-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(7) [[TMP58]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP60:%.*]] = mul i32 [[TMP59]], [[TMP57]] -; CHECK-ATTRSIZE-8-NEXT: [[TMP61:%.*]] = zext i32 [[TMP60]] to i64 -; CHECK-ATTRSIZE-8-NEXT: [[TMP62:%.*]] = add i64 [[DOTFR541]], [[TMP61]] -; CHECK-ATTRSIZE-8-NEXT: 
[[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr addrspace(4) -; CHECK-ATTRSIZE-8-NEXT: [[TMP64:%.*]] = load i32, ptr addrspace(4) [[TMP63]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP65:%.*]] = freeze i32 [[TMP64]] -; CHECK-ATTRSIZE-8-NEXT: br label [[DOTEXIT2]] -; CHECK-ATTRSIZE-8: .exit2: -; CHECK-ATTRSIZE-8-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP57]], [[TMP56]] ], [ undef, [[TMP46]] ] -; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0128_0_EXTRACT_TRUNC:%.*]] = phi i32 [ [[TMP65]], [[TMP56]] ], [ 0, [[TMP46]] ] -; CHECK-ATTRSIZE-8-NEXT: [[DOTNOT542:%.*]] = icmp eq i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], 0 -; CHECK-ATTRSIZE-8-NEXT: br i1 [[DOTNOT542]], label [[TMP107:%.*]], label [[TMP66:%.*]] -; CHECK-ATTRSIZE-8: 66: -; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0130_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> [[TMP2]], 0, 0 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_0_INSERT]], i32 [[DOT0]], 0, 1 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_1_INSERT]], i64 [[TMP6]], 1, 0 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_0_INSERT]], i32 [[TMP8]], 1, 1 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, 
<3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_1_INSERT]], i32 [[DOTFR539]], 1, 2 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_2_INSERT]], <3 x float> [[TMP12]], 1, 3 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_3_INSERT]], <3 x float> [[TMP14]], 1, 4 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_5_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_4_INSERT]], float [[TMP16]], 1, 5 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_5_INSERT]], float [[TMP18]], 1, 6 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_6_INSERT]], float [[TMP20]], 2, 0, 0 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_0_INSERT]], i32 [[DOTFR]], 2, 0, 1 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, 
i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_1_INSERT]], i32 [[TMP24]], 2, 0, 2 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_2_INSERT]], i32 [[TMP26]], 2, 0, 3 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_3_INSERT]], i32 [[TMP28]], 2, 0, 4 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_4_INSERT]], <2 x float> [[TMP30]], 2, 1 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_1_INSERT]], i32 [[TMP32]], 2, 2 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_2_INSERT]], i32 [[TMP34]], 2, 3 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_3_INSERT]], i32 [[TMP36]], 2, 4 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_5_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, 
i32, i32, i32, i64 } } [[DOTFCA_2_4_INSERT]], i32 [[TMP38]], 2, 5 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT]], i32 [[TMP40]], 2, 6 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_7_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT]], i32 [[TMP42]], 2, 7 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_8_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT]], i64 [[TMP44]], 2, 8 -; CHECK-ATTRSIZE-8-NEXT: [[TMP109:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-ATTRSIZE-8-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 poison, i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [5 x i32] poison, [8 x i32] [[TMP109]]), !continuation.registercount [[META0]] -; CHECK-ATTRSIZE-8-NEXT: unreachable -; CHECK-ATTRSIZE-8: 68: -; CHECK-ATTRSIZE-8-NEXT: [[TMP68:%.*]] = shl i32 [[DOTFR]], 3 -; CHECK-ATTRSIZE-8-NEXT: [[TMP69:%.*]] = and i32 [[TMP68]], -64 -; CHECK-ATTRSIZE-8-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i64 -; CHECK-ATTRSIZE-8-NEXT: [[TMP71:%.*]] = add i64 [[TMP6]], [[TMP70]] -; CHECK-ATTRSIZE-8-NEXT: [[TMP72:%.*]] = add i64 [[TMP71]], 48 -; CHECK-ATTRSIZE-8-NEXT: [[TMP73:%.*]] = inttoptr i64 [[TMP72]] to ptr addrspace(1) -; CHECK-ATTRSIZE-8-NEXT: [[TMP74:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP73]], align 16 -; CHECK-ATTRSIZE-8-NEXT: [[DOT4_VEC_EXTRACT452:%.*]] = extractelement <4 x i32> [[TMP74]], i64 1 -; CHECK-ATTRSIZE-8-NEXT: [[TMP75:%.*]] = and i32 [[TMP26]], 16777215 -; CHECK-ATTRSIZE-8-NEXT: [[TMP76:%.*]] = and i32 [[DOT4_VEC_EXTRACT452]], 16777215 -; CHECK-ATTRSIZE-8-NEXT: [[TMP77:%.*]] = lshr i32 [[TMP8]], 8 -; CHECK-ATTRSIZE-8-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 15 -; CHECK-ATTRSIZE-8-NEXT: [[TMP79:%.*]] = lshr i32 [[TMP8]], 12 -; CHECK-ATTRSIZE-8-NEXT: [[TMP80:%.*]] = and i32 [[TMP79]], 15 -; CHECK-ATTRSIZE-8-NEXT: [[TMP81:%.*]] = mul nuw nsw i32 [[TMP80]], [[TMP75]] -; CHECK-ATTRSIZE-8-NEXT: [[TMP82:%.*]] = add nuw nsw i32 [[TMP78]], [[TMP81]] -; CHECK-ATTRSIZE-8-NEXT: [[TMP83:%.*]] = add nuw nsw i32 [[TMP82]], [[TMP76]] -; CHECK-ATTRSIZE-8-NEXT: [[TMP84:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 9 -; CHECK-ATTRSIZE-8-NEXT: 
[[TMP85:%.*]] = load i32, ptr addrspace(7) [[TMP84]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP86:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 10 -; CHECK-ATTRSIZE-8-NEXT: [[TMP87:%.*]] = load i32, ptr addrspace(7) [[TMP86]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP88:%.*]] = zext i32 [[TMP87]] to i64 -; CHECK-ATTRSIZE-8-NEXT: [[TMP89:%.*]] = shl nuw i64 [[TMP88]], 32 -; CHECK-ATTRSIZE-8-NEXT: [[TMP90:%.*]] = zext i32 [[TMP85]] to i64 -; CHECK-ATTRSIZE-8-NEXT: [[TMP91:%.*]] = or i64 [[TMP89]], [[TMP90]] -; CHECK-ATTRSIZE-8-NEXT: [[DOTFR537:%.*]] = freeze i64 [[TMP91]] -; CHECK-ATTRSIZE-8-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DOTFR537]], 0 -; CHECK-ATTRSIZE-8-NEXT: br i1 [[DOTNOT]], label [[DOTEXIT5:%.*]], label [[TMP92:%.*]] -; CHECK-ATTRSIZE-8: 93: -; CHECK-ATTRSIZE-8-NEXT: [[TMP93:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 11 -; CHECK-ATTRSIZE-8-NEXT: [[TMP94:%.*]] = load i32, ptr addrspace(7) [[TMP93]], align 4 -; CHECK-ATTRSIZE-8-NEXT: [[TMP95:%.*]] = mul i32 [[TMP94]], [[TMP83]] -; CHECK-ATTRSIZE-8-NEXT: [[TMP96:%.*]] = zext i32 [[TMP95]] to i64 -; CHECK-ATTRSIZE-8-NEXT: [[TMP97:%.*]] = add i64 [[DOTFR537]], [[TMP96]] -; CHECK-ATTRSIZE-8-NEXT: [[TMP98:%.*]] = inttoptr i64 [[TMP97]] to ptr addrspace(1) -; CHECK-ATTRSIZE-8-NEXT: [[TMP99:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP98]], align 16 -; CHECK-ATTRSIZE-8-NEXT: [[TMP100:%.*]] = shufflevector <4 x i32> [[TMP99]], <4 x i32> poison, <2 x i32> -; CHECK-ATTRSIZE-8-NEXT: [[TMP101:%.*]] = freeze <2 x i32> [[TMP100]] -; CHECK-ATTRSIZE-8-NEXT: br label [[DOTEXIT5]] -; CHECK-ATTRSIZE-8: .exit5: -; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0501_0:%.*]] = phi <2 x i32> [ 
[[TMP101]], [[TMP92]] ], [ zeroinitializer, [[TMP67]] ] -; CHECK-ATTRSIZE-8-NEXT: [[TMP102:%.*]] = and i32 [[DOTFR539]], 8 -; CHECK-ATTRSIZE-8-NEXT: [[TMP103:%.*]] = icmp ne i32 [[TMP102]], 0 -; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0150_0_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[DOTSROA_0501_0]], i64 0 -; CHECK-ATTRSIZE-8-NEXT: [[DOTNOT540:%.*]] = icmp eq i32 [[DOTSROA_0150_0_VEC_EXTRACT]], 0 -; CHECK-ATTRSIZE-8-NEXT: [[OR_COND:%.*]] = or i1 [[TMP103]], [[DOTNOT540]] -; CHECK-ATTRSIZE-8-NEXT: br i1 [[OR_COND]], label [[TMP107]], label [[TMP104:%.*]] -; CHECK-ATTRSIZE-8: 105: -; CHECK-ATTRSIZE-8-NEXT: [[TMP106:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @_cont_Traversal) -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_0_INSERT322:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> [[TMP2]], 0, 0 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_1_INSERT323:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_0_INSERT322]], i32 [[TMP83]], 0, 1 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_0_INSERT324:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_1_INSERT323]], i64 [[TMP6]], 1, 0 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_1_INSERT325:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_0_INSERT324]], i32 [[TMP8]], 1, 1 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_2_INSERT326:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, 
i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_1_INSERT325]], i32 [[DOTFR539]], 1, 2 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_3_INSERT327:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_2_INSERT326]], <3 x float> [[TMP12]], 1, 3 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_4_INSERT328:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_3_INSERT327]], <3 x float> [[TMP14]], 1, 4 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_5_INSERT329:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_4_INSERT328]], float [[TMP16]], 1, 5 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_6_INSERT330:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_5_INSERT329]], float [[TMP18]], 1, 6 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_0_INSERT331:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_6_INSERT330]], float [[TMP20]], 2, 0, 0 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_1_INSERT332:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_0_INSERT331]], i32 [[DOTFR]], 2, 0, 1 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_2_INSERT333:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, 
i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_1_INSERT332]], i32 [[TMP24]], 2, 0, 2 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_3_INSERT334:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_2_INSERT333]], i32 [[TMP26]], 2, 0, 3 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_4_INSERT335:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_3_INSERT334]], i32 [[TMP28]], 2, 0, 4 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_1_INSERT336:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_4_INSERT335]], <2 x float> [[TMP30]], 2, 1 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_2_INSERT337:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_1_INSERT336]], i32 [[TMP32]], 2, 2 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_3_INSERT338:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_2_INSERT337]], i32 [[TMP34]], 2, 3 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_4_INSERT339:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_3_INSERT338]], i32 [[TMP36]], 2, 4 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_5_INSERT340:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, 
i32, i32, i32, i32, i64 } } [[DOTFCA_2_4_INSERT339]], i32 [[TMP38]], 2, 5 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_6_INSERT341:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT340]], i32 [[TMP40]], 2, 6 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_7_INSERT342:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT341]], i32 [[TMP42]], 2, 7 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_8_INSERT343:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT342]], i64 [[TMP44]], 2, 8 -; CHECK-ATTRSIZE-8-NEXT: [[TMP108:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-ATTRSIZE-8-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, i32 poison, i32 [[TMP106]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [5 x i32] poison, [8 x i32] [[TMP108]]), !continuation.registercount [[META0]] -; CHECK-ATTRSIZE-8-NEXT: unreachable -; CHECK-ATTRSIZE-8: 108: -; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_7_0:%.*]] = phi i32 [ [[TMP4]], [[DOTEXIT2]] ], [ [[TMP83]], [[DOTEXIT5]] ] -; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0373_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } poison, <3 x i32> [[TMP2]], 0 -; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } [[DOTFCA_0_INSERT]], i32 [[DOTSROA_7_0]], 1 -; CHECK-ATTRSIZE-8-NEXT: [[TMP110:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-ATTRSIZE-8-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, i32 poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [32 x i32] poison, [8 x i32] [[TMP110]]), !continuation.registercount [[META0]] -; CHECK-ATTRSIZE-8-NEXT: unreachable -; -.entry: - %1 = call ptr addrspace(7) @lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0) - %2 = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) %1) - %3 = load <3 x i32>, ptr addrspace(5) %0, align 16 - %4 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 0, i32 1 - %5 = load i32, ptr addrspace(5) %4, align 4 - %6 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 1, i32 0 - %7 = load i64, ptr addrspace(5) %6, align 8 - %8 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 1, i32 1 - %9 = load i32, ptr addrspace(5) %8, align 4 - %10 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 1, i32 2 - %11 = load i32, ptr addrspace(5) %10, align 4 - %.fr539 = freeze i32 %11 - %12 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 1, i32 3 - %13 = load <3 x float>, ptr addrspace(5) %12, align 16 - %14 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { 
float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 1, i32 4 - %15 = load <3 x float>, ptr addrspace(5) %14, align 16 - %16 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 1, i32 5 - %17 = load float, ptr addrspace(5) %16, align 4 - %18 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 1, i32 6 - %19 = load float, ptr addrspace(5) %18, align 4 - %20 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 0, i32 0 - %21 = load float, ptr addrspace(5) %20, align 4 - %22 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 0, i32 1 - %23 = load i32, ptr addrspace(5) %22, align 4 - %.fr = freeze i32 %23 - %24 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 0, i32 2 - %25 = load i32, ptr addrspace(5) %24, align 4 - %26 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 0, i32 3 - %27 = load i32, ptr addrspace(5) %26, align 4 - %28 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float 
}, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 0, i32 4 - %29 = load i32, ptr addrspace(5) %28, align 4 - %30 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 1 - %31 = load <2 x float>, ptr addrspace(5) %30, align 8 - %32 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 2 - %33 = load i32, ptr addrspace(5) %32, align 4 - %34 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 3 - %35 = load i32, ptr addrspace(5) %34, align 4 - %36 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 4 - %37 = load i32, ptr addrspace(5) %36, align 4 - %38 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 5 - %39 = load i32, ptr addrspace(5) %38, align 4 - %40 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 6 - %41 = load i32, ptr addrspace(5) %40, align 4 - %42 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x 
float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 7 - %43 = load i32, ptr addrspace(5) %42, align 4 - %44 = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) %0, i32 0, i32 2, i32 8 - %45 = load i64, ptr addrspace(5) %44, align 8 - %46 = icmp ugt i32 %.fr, -3 - br i1 %46, label %47, label %68 - -47: ; preds = %.entry - %48 = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) %1, i32 0, i32 5 - %49 = load i32, ptr addrspace(7) %48, align 4 - %50 = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) %1, i32 0, i32 6 - %51 = load i32, ptr addrspace(7) %50, align 4 - %52 = zext i32 %51 to i64 - %53 = shl nuw i64 %52, 32 - %54 = zext i32 %49 to i64 - %55 = or i64 %53, %54 - %.fr541 = freeze i64 %55 - %56 = icmp eq i64 %.fr541, 0 - br i1 %56, label %.exit2, label %57 - -57: ; preds = %47 - %58 = lshr i32 %9, 16 - %59 = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) %1, i32 0, i32 7 - %60 = load i32, ptr addrspace(7) %59, align 4 - %61 = mul i32 %60, %58 - %62 = zext i32 %61 to i64 - %63 = add i64 %.fr541, %62 - %64 = inttoptr i64 %63 to ptr addrspace(4) - %65 = load i32, ptr addrspace(4) %64, align 4 - %66 = freeze i32 %65 - br label %.exit2 - -.exit2: ; preds = %47, %57 - %.0 = phi i32 [ %58, %57 ], [ undef, %47 ] - %.sroa.0128.0.extract.trunc = phi i32 [ %66, %57 ], [ 0, %47 ] - %.not542 = icmp eq i32 %.sroa.0128.0.extract.trunc, 0 - br i1 %.not542, label 
%106, label %67 - -67: ; preds = %.exit2 - %.sroa.0130.0.extract.trunc = trunc i64 %45 to i32 - %.fca.0.0.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> %3, 0, 0 - %.fca.0.1.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.0.0.insert, i32 %.0, 0, 1 - %.fca.1.0.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.0.1.insert, i64 %7, 1, 0 - %.fca.1.1.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.0.insert, i32 %9, 1, 1 - %.fca.1.2.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.1.insert, i32 %.fr539, 1, 2 - %.fca.1.3.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.2.insert, <3 x float> %13, 1, 3 - %.fca.1.4.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.3.insert, <3 x float> %15, 1, 4 - %.fca.1.5.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.4.insert, float %17, 1, 5 - %.fca.1.6.insert = insertvalue { { <3 x i32>, 
i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.5.insert, float %19, 1, 6 - %.fca.2.0.0.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.6.insert, float %21, 2, 0, 0 - %.fca.2.0.1.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.0.0.insert, i32 %.fr, 2, 0, 1 - %.fca.2.0.2.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.0.1.insert, i32 %25, 2, 0, 2 - %.fca.2.0.3.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.0.2.insert, i32 %27, 2, 0, 3 - %.fca.2.0.4.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.0.3.insert, i32 %29, 2, 0, 4 - %.fca.2.1.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.0.4.insert, <2 x float> %31, 2, 1 - %.fca.2.2.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.1.insert, i32 %33, 2, 2 - %.fca.2.3.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, 
i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.2.insert, i32 %35, 2, 3 - %.fca.2.4.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.3.insert, i32 %37, 2, 4 - %.fca.2.5.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.4.insert, i32 %39, 2, 5 - %.fca.2.6.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.5.insert, i32 %41, 2, 6 - %.fca.2.7.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.6.insert, i32 %43, 2, 7 - %.fca.2.8.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.7.insert, i64 %45, 2, 8 - call void (...) 
@lgc.cps.jump(i32 %.sroa.0128.0.extract.trunc, i32 -1, i32 %.sroa.0130.0.extract.trunc, i32 poison, i32 %.0, { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.8.insert) - unreachable - -68: ; preds = %.entry - %69 = shl i32 %.fr, 3 - %70 = and i32 %69, -64 - %71 = zext i32 %70 to i64 - %72 = add i64 %7, %71 - %73 = add i64 %72, 48 - %74 = inttoptr i64 %73 to ptr addrspace(1) - %75 = load <4 x i32>, ptr addrspace(1) %74, align 16 - %.4.vec.extract452 = extractelement <4 x i32> %75, i64 1 - %76 = and i32 %27, 16777215 - %77 = and i32 %.4.vec.extract452, 16777215 - %78 = lshr i32 %9, 8 - %79 = and i32 %78, 15 - %80 = lshr i32 %9, 12 - %81 = and i32 %80, 15 - %82 = mul nuw nsw i32 %81, %76 - %83 = add nuw nsw i32 %79, %82 - %84 = add nuw nsw i32 %83, %77 - %85 = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) %1, i32 0, i32 9 - %86 = load i32, ptr addrspace(7) %85, align 4 - %87 = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) %1, i32 0, i32 10 - %88 = load i32, ptr addrspace(7) %87, align 4 - %89 = zext i32 %88 to i64 - %90 = shl nuw i64 %89, 32 - %91 = zext i32 %86 to i64 - %92 = or i64 %90, %91 - %.fr537 = freeze i64 %92 - %.not = icmp eq i64 %.fr537, 0 - br i1 %.not, label %.exit5, label %93 - -93: ; preds = %68 - %94 = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) %1, i32 0, i32 11 - %95 = load i32, ptr addrspace(7) %94, align 4 - %96 = mul i32 %95, %84 - %97 = zext i32 %96 to i64 - %98 = add i64 %.fr537, 
%97 - %99 = inttoptr i64 %98 to ptr addrspace(1) - %100 = load <4 x i32>, ptr addrspace(1) %99, align 16 - %101 = shufflevector <4 x i32> %100, <4 x i32> poison, <2 x i32> - %102 = freeze <2 x i32> %101 - br label %.exit5 - -.exit5: ; preds = %93, %68 - %.sroa.0501.0 = phi <2 x i32> [ %102, %93 ], [ zeroinitializer, %68 ] - %103 = and i32 %.fr539, 8 - %104 = icmp ne i32 %103, 0 - %.sroa.0150.0.vec.extract = extractelement <2 x i32> %.sroa.0501.0, i64 0 - %.not540 = icmp eq i32 %.sroa.0150.0.vec.extract, 0 - %or.cond = or i1 %104, %.not540 - br i1 %or.cond, label %106, label %105 - -105: ; preds = %.exit5 - %addr = call i32 @_AmdGetCurrentFuncAddr() - %.fca.0.0.insert322 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> %3, 0, 0 - %.fca.0.1.insert323 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.0.0.insert322, i32 %84, 0, 1 - %.fca.1.0.insert324 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.0.1.insert323, i64 %7, 1, 0 - %.fca.1.1.insert325 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.0.insert324, i32 %9, 1, 1 - %.fca.1.2.insert326 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.1.insert325, i32 %.fr539, 1, 2 - %.fca.1.3.insert327 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, 
i32, i32, i32, i32, i32, i64 } } %.fca.1.2.insert326, <3 x float> %13, 1, 3 - %.fca.1.4.insert328 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.3.insert327, <3 x float> %15, 1, 4 - %.fca.1.5.insert329 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.4.insert328, float %17, 1, 5 - %.fca.1.6.insert330 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.5.insert329, float %19, 1, 6 - %.fca.2.0.0.insert331 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.1.6.insert330, float %21, 2, 0, 0 - %.fca.2.0.1.insert332 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.0.0.insert331, i32 %.fr, 2, 0, 1 - %.fca.2.0.2.insert333 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.0.1.insert332, i32 %25, 2, 0, 2 - %.fca.2.0.3.insert334 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.0.2.insert333, i32 %27, 2, 0, 3 - %.fca.2.0.4.insert335 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.0.3.insert334, i32 
%29, 2, 0, 4 - %.fca.2.1.insert336 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.0.4.insert335, <2 x float> %31, 2, 1 - %.fca.2.2.insert337 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.1.insert336, i32 %33, 2, 2 - %.fca.2.3.insert338 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.2.insert337, i32 %35, 2, 3 - %.fca.2.4.insert339 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.3.insert338, i32 %37, 2, 4 - %.fca.2.5.insert340 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.4.insert339, i32 %39, 2, 5 - %.fca.2.6.insert341 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.5.insert340, i32 %41, 2, 6 - %.fca.2.7.insert342 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.6.insert341, i32 %43, 2, 7 - %.fca.2.8.insert343 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.7.insert342, i64 %45, 2, 8 - call void (...) 
@lgc.cps.jump(i32 %.sroa.0150.0.vec.extract, i32 -1, i32 poison, i32 %addr, i32 %84, { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.8.insert343) - unreachable - -106: ; preds = %.exit5, %.exit2 - %.sroa.7.0 = phi i32 [ %5, %.exit2 ], [ %84, %.exit5 ] - %.sroa.0373.0.extract.trunc = trunc i64 %45 to i32 - %.fca.0.insert = insertvalue { <3 x i32>, i32 } poison, <3 x i32> %3, 0 - %.fca.1.insert = insertvalue { <3 x i32>, i32 } %.fca.0.insert, i32 %.sroa.7.0, 1 - call void (...) @lgc.cps.jump(i32 %.sroa.0373.0.extract.trunc, i32 -1, i32 poison, i32 poison, i32 %.sroa.7.0, { <3 x i32>, i32 } %.fca.1.insert) - unreachable -} - -declare void @lgc.cps.jump(...) local_unnamed_addr -declare ptr addrspace(7) @lgc.load.buffer.desc(i64 %0, i32 %1, i32 %2, i32 %3) local_unnamed_addr -declare ptr @llvm.invariant.start.p7(i64 immarg %0, ptr addrspace(7) nocapture %1) - -!continuation.maxUsedPayloadRegisterCount = !{!7} -!lgc.cps.module = !{} -!lgc.rt.max.attribute.size = !{!4} - -!0 = !{i32 7} -!1 = !{ { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison} -!2 = !{i32 5, { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison} -!3 = !{i32 6} -!4 = !{i32 16} ; HITATTR_SIZE_16 -!4 = !{i32 8} ; HITATTR_SIZE_8 -!5 = !{i32 0, %struct.AnyHitTraversalData poison} -!6 = !{ %struct.AnyHitTraversalData poison} -!7 = !{i32 8} -!9 = !{i32 0, %struct.DispatchSystemData poison} -!10 = !{%struct.DispatchSystemData poison} diff --git a/llvmraytracing/test/lgccps/multiple-await.ll b/llvmraytracing/test/lgccps/multiple-await.ll index 31a73d0967..2e6de52440 100644 --- a/llvmraytracing/test/lgccps/multiple-await.ll +++ 
b/llvmraytracing/test/lgccps/multiple-await.ll @@ -6,17 +6,17 @@ declare !lgc.cps !0 void @callee({}, i32, float) declare !lgc.cps !0 void @callee2({}, i32, float) -define void @test({} %state, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 { +define void @test(i32 %shaderIndex, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 { %t0 = fadd float %arg, 1.0 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee) - %t1 = call { float } (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0), !continuation.returnedRegistercount !{i32 0} - %res = extractvalue { float } %t1, 0 + %t1 = call { i32, float } (...) @lgc.cps.await__sl_i32f32(i32 %cr, i32 2, i32 poison, float %t0), !continuation.returnedRegistercount !{i32 0} + %res = extractvalue { i32, float } %t1, 1 %t2 = fmul float %res, %arg %cr2 = call i32 @lgc.cps.as.continuation.reference(ptr @callee2) - %t3 = call { float } (...) @lgc.cps.await__f32(i32 %cr2, i32 2, float %t2), !continuation.returnedRegistercount !{i32 0} - %res.2 = extractvalue { float } %t3, 0 - %returnvalue = fadd float %res.2, %arg2 - call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, float %returnvalue) + %t3 = call { i32, float } (...) @lgc.cps.await__sl_i32f32(i32 %cr2, i32 2, i32 poison, float %t2), !continuation.returnedRegistercount !{i32 0} + %t3.ret = extractvalue { i32, float } %t3, 1 + %returnvalue = fadd float %t3.ret, %arg2 + call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, i32 poison, float %returnvalue) unreachable } @@ -26,79 +26,81 @@ define void @test({} %state, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 { !1 = !{i32 5} declare i32 @lgc.cps.as.continuation.reference(...) memory(none) -declare { float } @lgc.cps.await__f32(...) +declare { i32, float } @lgc.cps.await__sl_i32f32(...) declare void @lgc.cps.jump(...) 
; CHECK-LABEL: define void @test( -; CHECK-SAME: i32 [[CSPINIT:%.*]], {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 12 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 0 -; CHECK-NEXT: store float [[ARG2]], ptr addrspace(5) [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], 4 +; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 ; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP9]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP10]], i32 0 -; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP11]], align 4 +; CHECK-NEXT: store float [[ARG2]], ptr addrspace(5) [[TMP11]], align 4 ; CHECK-NEXT: [[T0:%.*]] = fadd float [[ARG]], 1.000000e+00 ; 
CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee) ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @test.resume.0) ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP12]], i32 [[TMP1]], float [[T0]]), !continuation.returnedRegistercount [[META4:![0-9]+]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP12]], i32 poison, i32 [[TMP1]], float [[T0]]), !continuation.returnedRegistercount [[META4:![0-9]+]] ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @test.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], float [[TMP2:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], float [[TMP1:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -12 -; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { float } poison, float [[TMP2]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { i32, float } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { i32, float } [[TMP11]], float [[TMP1]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i32 [[TMP8]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP9]], i32 0 ; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 -; CHECK-NEXT: [[RES1:%.*]] = extractvalue { float } [[TMP11]], 0 +; CHECK-NEXT: [[RES1:%.*]] = extractvalue { i32, float } [[TMP12]], 1 ; CHECK-NEXT: [[T2:%.*]] = fmul float 
[[RES1]], [[ARG_RELOAD]] ; CHECK-NEXT: [[CR2:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee2) ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[CR2]] to ptr ; CHECK-NEXT: [[TMP6:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @test.resume.1) ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR2]], i32 2, i32 [[TMP10]], i32 [[TMP6]], float [[T2]]), !continuation.returnedRegistercount [[META4]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR2]], i32 2, i32 [[TMP10]], i32 poison, i32 [[TMP6]], float [[T2]]), !continuation.returnedRegistercount [[META4]] ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @test.resume.1( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], float [[TMP2:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], float [[TMP1:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { ; CHECK-NEXT: entryresume.1: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -12 -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { float } poison, float [[TMP2]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { i32, float } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { i32, float } [[TMP13]], float [[TMP1]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP6]], i32 0 -; CHECK-NEXT: [[ARG2_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; 
CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 -; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 -; CHECK-NEXT: [[RES_21:%.*]] = extractvalue { float } [[TMP13]], 0 -; CHECK-NEXT: [[RETURNVALUE:%.*]] = fadd float [[RES_21]], [[ARG2_RELOAD]] +; CHECK-NEXT: [[ARG2_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP9]], align 4 +; CHECK-NEXT: [[T3_RET1:%.*]] = extractvalue { i32, float } [[TMP14]], 1 +; CHECK-NEXT: [[RETURNVALUE:%.*]] = fadd float [[T3_RET1]], [[ARG2_RELOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -12 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/simple-await-more-state.ll b/llvmraytracing/test/lgccps/simple-await-more-state.ll index ee0fb87ca0..90dfeb6b00 100644 --- a/llvmraytracing/test/lgccps/simple-await-more-state.ll +++ b/llvmraytracing/test/lgccps/simple-await-more-state.ll @@ -5,14 +5,14 @@ declare !lgc.cps !0 void @callee({}, i32, float) -define void @test({} %state, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 { +define void @test(i32 %shaderIndex, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 { %t0 = fadd float %arg, 1.0 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee) - %t1 = call { float } (...) 
@lgc.cps.await__f32(i32 %cr, i32 2, float %t0), !continuation.returnedRegistercount !{i32 0} - %res = extractvalue { float } %t1, 0 + %t1 = call { i32, float } (...) @lgc.cps.await__sl_i32f32(i32 %cr, i32 2, i32 poison, float %t0), !continuation.returnedRegistercount !{i32 0} + %res = extractvalue { i32, float } %t1, 1 %t2 = fmul float %res, %arg %returnvalue = fadd float %t2, %arg2 - call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, float %returnvalue) + call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, i32 poison, float %returnvalue) unreachable } @@ -22,62 +22,63 @@ define void @test({} %state, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 { !1 = !{i32 5} declare i32 @lgc.cps.as.continuation.reference(...) memory(none) -declare { float } @lgc.cps.await__f32(...) +declare { i32, float } @lgc.cps.await__sl_i32f32(...) declare void @lgc.cps.jump(...) ; CHECK-LABEL: define void @test( -; CHECK-SAME: i32 [[CSPINIT:%.*]], {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 12 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(5) ; CHECK-NEXT: 
[[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 0 -; CHECK-NEXT: store float [[ARG2]], ptr addrspace(5) [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], 4 +; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 ; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP9]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP10]], i32 0 -; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP11]], align 4 +; CHECK-NEXT: store float [[ARG2]], ptr addrspace(5) [[TMP11]], align 4 ; CHECK-NEXT: [[T0:%.*]] = fadd float [[ARG]], 1.000000e+00 ; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee) ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @test.resume.0) ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP12]], i32 [[TMP1]], float [[T0]]), !continuation.returnedRegistercount [[META4:![0-9]+]] +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP12]], i32 poison, i32 [[TMP1]], float [[T0]]), !continuation.returnedRegistercount [[META4:![0-9]+]] ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @test.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], float [[TMP2:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], float [[TMP1:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -12 -; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { float } poison, float [[TMP2]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i32, float } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i32, float } [[TMP16]], float [[TMP1]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP6]], i32 0 -; CHECK-NEXT: [[ARG2_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], 4 +; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i32 [[TMP8]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP9]], i32 0 ; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP10]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP12:%.*]] 
= getelementptr i8, ptr addrspace(5) [[TMP11]], i32 0 -; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP12]], align 4 -; CHECK-NEXT: [[RES1:%.*]] = extractvalue { float } [[TMP16]], 0 +; CHECK-NEXT: [[ARG2_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP12]], align 4 +; CHECK-NEXT: [[RES1:%.*]] = extractvalue { i32, float } [[TMP17]], 1 ; CHECK-NEXT: [[T2:%.*]] = fmul float [[RES1]], [[ARG_RELOAD]] ; CHECK-NEXT: [[RETURNVALUE:%.*]] = fadd float [[T2]], [[ARG2_RELOAD]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], -12 ; CHECK-NEXT: store i32 [[TMP14]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP15]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP15]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/simple-await.ll b/llvmraytracing/test/lgccps/simple-await.ll index c9ad1c82a6..24bf723e66 100644 --- a/llvmraytracing/test/lgccps/simple-await.ll +++ b/llvmraytracing/test/lgccps/simple-await.ll @@ -6,13 +6,13 @@ declare !lgc.cps !0 void @callee({}, i32, float) -define void @test({} %state, i32 %rcr, float %arg) !lgc.cps !0 { +define void @test(i32 %shaderIndex, i32 %rcr, float %arg) !lgc.cps !0 { %t0 = fadd float %arg, 1.0 %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee) - %t1 = call { float } (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0), !continuation.returnedRegistercount !{i32 0} - %res = extractvalue { float } %t1, 0 - %returnvalue = fmul float %res, %arg - call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, float %returnvalue) + %t1 = call { i32, float } (...) 
@lgc.cps.await__sl_i32f32(i32 %cr, i32 2, i32 poison, float %t0), !continuation.returnedRegistercount !{i32 0} + %f.t1 = extractvalue { i32, float } %t1, 1 + %returnvalue = fmul float %f.t1, %arg + call void (...) @lgc.cps.jump(i32 %rcr, i32 2, i32 poison, i32 poison, i32 poison, float %returnvalue) unreachable } !continuation.stackAddrspace = !{!1} @@ -21,10 +21,10 @@ define void @test({} %state, i32 %rcr, float %arg) !lgc.cps !0 { !1 = !{i32 5} declare i32 @lgc.cps.as.continuation.reference(...) memory(none) -declare { float } @lgc.cps.await__f32(...) +declare { i32, float } @lgc.cps.await__sl_i32f32(...) declare void @lgc.cps.jump(...) ; CHECK-LABEL: define void @test( -; CHECK-SAME: i32 [[CSPINIT:%.*]], {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -32,58 +32,59 @@ declare void @lgc.cps.jump(...) 
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 8 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 0 -; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(5) +; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP7]], i32 0 -; CHECK-NEXT: store i32 [[RCR]], ptr addrspace(5) [[TMP8]], align 4 +; CHECK-NEXT: store float [[ARG]], ptr addrspace(5) [[TMP8]], align 4 ; CHECK-NEXT: [[T0:%.*]] = fadd float [[ARG]], 1.000000e+00 ; CHECK-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee) ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @test.resume.0) ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP9]], i32 [[TMP1]], float [[T0]]), !continuation.returnedRegistercount [[META4:![0-9]+]] +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[CR]], i32 2, i32 [[TMP9]], i32 poison, i32 [[TMP1]], float [[T0]]), !continuation.returnedRegistercount [[META4:![0-9]+]] ; CHECK-NEXT: unreachable ; ; ; CHECK-LABEL: define dso_local void @test.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], float [[TMP2:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i32 [[TMP0:%.*]], float [[TMP1:%.*]]) !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META4]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -8 -; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { float } poison, float [[TMP2]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { i32, float } poison, i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { i32, float } [[TMP13]], float [[TMP1]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP6]], i32 0 -; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr addrspace(5) +; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(5) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP8]], i32 0 -; CHECK-NEXT: [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 -; CHECK-NEXT: [[RES1:%.*]] = extractvalue { float } [[TMP13]], 0 -; CHECK-NEXT: [[RETURNVALUE:%.*]] = fmul float [[RES1]], [[ARG_RELOAD]] +; CHECK-NEXT: [[ARG_RELOAD:%.*]] = load float, ptr addrspace(5) 
[[TMP9]], align 4 +; CHECK-NEXT: [[F_T11:%.*]] = extractvalue { i32, float } [[TMP14]], 1 +; CHECK-NEXT: [[RETURNVALUE:%.*]] = fmul float [[F_T11]], [[ARG_RELOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, float [[RETURNVALUE]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, i32 [[TMP12]], i32 poison, i32 poison, float [[RETURNVALUE]]) ; CHECK-NEXT: unreachable ; ; ; LOWER-AWAIT-LABEL: define { ptr, ptr } @test( -; LOWER-AWAIT-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], ptr [[TMP0:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { +; LOWER-AWAIT-SAME: i32 [[SHADERINDEX:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], ptr [[TMP0:%.*]]) !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { ; LOWER-AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.test, ptr @continuation.malloc, ptr @continuation.free) -; LOWER-AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) +; LOWER-AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin.custom.abi(token [[TMP2]], ptr null, i32 0) ; LOWER-AWAIT-NEXT: [[T0:%.*]] = fadd float [[ARG]], 1.000000e+00 ; LOWER-AWAIT-NEXT: [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee) ; LOWER-AWAIT-NEXT: [[TMP4:%.*]] = inttoptr i32 [[CR]] to ptr -; LOWER-AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR]], i32 2, float [[T0]]), !continuation.returnedRegistercount [[META3:![0-9]+]] +; LOWER-AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR]], i32 2, i32 poison, float [[T0]]), !continuation.returnedRegistercount [[META3:![0-9]+]] ; LOWER-AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) 
@llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) -; LOWER-AWAIT-NEXT: [[TMP8:%.*]] = call { float } @lgc.ilcps.getReturnValue__sl_f32s() -; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = extractvalue { float } [[TMP8]], 0 +; LOWER-AWAIT-NEXT: [[TMP8:%.*]] = call { i32, float } @lgc.ilcps.getReturnValue__sl_i32f32s() +; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = extractvalue { i32, float } [[TMP8]], 1 ; LOWER-AWAIT-NEXT: [[RETURNVALUE:%.*]] = fmul float [[TMP7]], [[ARG]] -; LOWER-AWAIT-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, i32 poison, i32 poison, float [[RETURNVALUE]]) +; LOWER-AWAIT-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, i32 poison, i32 poison, i32 poison, float [[RETURNVALUE]]) ; LOWER-AWAIT-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/traversal-padding-hitattr-size.ll b/llvmraytracing/test/lgccps/traversal-padding-hitattr-size.ll new file mode 100644 index 0000000000..a748d04f04 --- /dev/null +++ b/llvmraytracing/test/lgccps/traversal-padding-hitattr-size.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; We run this test file twice with different max hit attribute sizes to test that e.g. padding depends correctly on the max hit attribute size. 
+; RUN: grep -v HITATTR_SIZE_8 %s | opt --verify-each -passes='lower-raytracing-pipeline,lint' -S --lint-abort-on-error | FileCheck -check-prefix=CHECK-ATTRSIZE-16 %s +; RUN: grep -v HITATTR_SIZE_16 %s | opt --verify-each -passes='lower-raytracing-pipeline,lint' -S --lint-abort-on-error | FileCheck -check-prefix=CHECK-ATTRSIZE-8 %s + +%struct.AnyHitTraversalData = type { i32 } +%struct.DispatchSystemData = type { i32 } + +; Need _cont_ReportHit to get system data type +declare !pointeetys !6 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*) +declare !pointeetys !4 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) + +declare !pointeetys !6 void @_cont_ExitRayGen(%struct.DispatchSystemData*) + +declare i32 @_AmdGetCurrentFuncAddr() + +define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !pointeetys !1 !lgc.rt.shaderstage !2 { +; CHECK-ATTRSIZE-16-LABEL: define dso_local spir_func void @_cont_Traversal( +; CHECK-ATTRSIZE-16-SAME: i32 [[SHADERINDEX:%.*]], i32 [[RETURNADDR:%.*]], { i32 } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [4 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META5:![0-9]+]] !lgc.rt.shaderstage [[META6:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !lgc.cps [[META7:![0-9]+]] !continuation [[META8:![0-9]+]] { +; CHECK-ATTRSIZE-16-NEXT: [[_ENTRY:.*:]] +; CHECK-ATTRSIZE-16-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { i32 }, align 8, addrspace(5) +; CHECK-ATTRSIZE-16-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [8 x i32], align 4 +; CHECK-ATTRSIZE-16-NEXT: store [8 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-16-NEXT: store { i32 } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 4 +; 
CHECK-ATTRSIZE-16-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], -3 +; CHECK-ATTRSIZE-16-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB4:.*]] +; CHECK-ATTRSIZE-16: [[BB2]]: +; CHECK-ATTRSIZE-16-NEXT: [[TMP3:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-16-NEXT: call void (...) @lgc.cps.jump(i32 poison, i32 -1, i32 poison, i32 poison, i32 poison, { i32 } poison, [4 x i32] poison, [8 x i32] [[TMP3]]), !continuation.registercount [[META0]] +; CHECK-ATTRSIZE-16-NEXT: unreachable +; CHECK-ATTRSIZE-16: [[BB4]]: +; CHECK-ATTRSIZE-16-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 8 +; CHECK-ATTRSIZE-16-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0 +; CHECK-ATTRSIZE-16-NEXT: br i1 [[TMP6]], label %[[BB7:.*]], label %[[BB10:.*]] +; CHECK-ATTRSIZE-16: [[BB7]]: +; CHECK-ATTRSIZE-16-NEXT: [[TMP8:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @_cont_Traversal) +; CHECK-ATTRSIZE-16-NEXT: [[TMP9:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-16-NEXT: call void (...) @lgc.cps.jump(i32 poison, i32 -1, i32 poison, i32 [[TMP8]], i32 poison, { i32 } poison, [4 x i32] poison, [8 x i32] [[TMP9]]), !continuation.registercount [[META0]] +; CHECK-ATTRSIZE-16-NEXT: unreachable +; CHECK-ATTRSIZE-16: [[BB10]]: +; CHECK-ATTRSIZE-16-NEXT: [[TMP11:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-16-NEXT: call void (...) 
@lgc.cps.jump(i32 poison, i32 -1, i32 poison, i32 poison, i32 poison, {} poison, [5 x i32] poison, [8 x i32] [[TMP11]]), !continuation.registercount [[META0]] +; CHECK-ATTRSIZE-16-NEXT: unreachable +; +; CHECK-ATTRSIZE-8-LABEL: define dso_local spir_func void @_cont_Traversal( +; CHECK-ATTRSIZE-8-SAME: i32 [[SHADERINDEX:%.*]], i32 [[RETURNADDR:%.*]], { i32 } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META4:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !lgc.cps [[META6:![0-9]+]] !continuation [[META7:![0-9]+]] { +; CHECK-ATTRSIZE-8-NEXT: [[_ENTRY:.*:]] +; CHECK-ATTRSIZE-8-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { i32 }, align 8, addrspace(5) +; CHECK-ATTRSIZE-8-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [8 x i32], align 4 +; CHECK-ATTRSIZE-8-NEXT: store [8 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-8-NEXT: store { i32 } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], -3 +; CHECK-ATTRSIZE-8-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB4:.*]] +; CHECK-ATTRSIZE-8: [[BB2]]: +; CHECK-ATTRSIZE-8-NEXT: [[TMP3:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-8-NEXT: call void (...) 
@lgc.cps.jump(i32 poison, i32 -1, i32 poison, i32 poison, i32 poison, { i32 } poison, [2 x i32] poison, [8 x i32] [[TMP3]]), !continuation.registercount [[META0]] +; CHECK-ATTRSIZE-8-NEXT: unreachable +; CHECK-ATTRSIZE-8: [[BB4]]: +; CHECK-ATTRSIZE-8-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 8 +; CHECK-ATTRSIZE-8-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0 +; CHECK-ATTRSIZE-8-NEXT: br i1 [[TMP6]], label %[[BB7:.*]], label %[[BB10:.*]] +; CHECK-ATTRSIZE-8: [[BB7]]: +; CHECK-ATTRSIZE-8-NEXT: [[TMP8:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference(ptr @_cont_Traversal) +; CHECK-ATTRSIZE-8-NEXT: [[TMP9:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-8-NEXT: call void (...) @lgc.cps.jump(i32 poison, i32 -1, i32 poison, i32 [[TMP8]], i32 poison, { i32 } poison, [2 x i32] poison, [8 x i32] [[TMP9]]), !continuation.registercount [[META0]] +; CHECK-ATTRSIZE-8-NEXT: unreachable +; CHECK-ATTRSIZE-8: [[BB10]]: +; CHECK-ATTRSIZE-8-NEXT: [[TMP11:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-8-NEXT: call void (...) @lgc.cps.jump(i32 poison, i32 -1, i32 poison, i32 poison, i32 poison, {} poison, [3 x i32] poison, [8 x i32] [[TMP11]]), !continuation.registercount [[META0]] +; CHECK-ATTRSIZE-8-NEXT: unreachable +; +.entry: + %1 = load i32, ptr addrspace(5) %0, align 4 + %2 = icmp ugt i32 %1, -3 + br i1 %2, label %3, label %4 + +3: + call void (...) @lgc.cps.jump(i32 poison, i32 -1, i32 poison, i32 poison, i32 poison, { i32 } poison) + unreachable + +4: + %5 = and i32 %1, 8 + %6 = icmp ne i32 %5, 0 + br i1 %6, label %7, label %8 + +7: + %addr = call i32 @_AmdGetCurrentFuncAddr() + call void (...) @lgc.cps.jump(i32 poison, i32 -1, i32 poison, i32 %addr, i32 poison, { i32 } poison) + unreachable + +8: + call void (...) @lgc.cps.jump(i32 poison, i32 -1, i32 poison, i32 poison, i32 poison, { } poison) + unreachable +} + +declare void @lgc.cps.jump(...) 
local_unnamed_addr + +!continuation.maxUsedPayloadRegisterCount = !{!5} +!lgc.cps.module = !{} +!lgc.rt.max.attribute.size = !{!3} + +!0 = !{i32 7} +!1 = !{ { i32 } poison} +!2 = !{i32 6} +!3 = !{i32 16} ; HITATTR_SIZE_16 +!3 = !{i32 8} ; HITATTR_SIZE_8 +!4 = !{%struct.AnyHitTraversalData poison} +!5 = !{i32 8} +!6 = !{%struct.DispatchSystemData poison} +;. +; CHECK-ATTRSIZE-16: [[META0]] = !{i32 8} +; CHECK-ATTRSIZE-16: [[META5]] = !{i32 7} +; CHECK-ATTRSIZE-16: [[META6]] = !{i32 6} +; CHECK-ATTRSIZE-16: [[META7]] = !{i32 3} +; CHECK-ATTRSIZE-16: [[META8]] = !{ptr @_cont_Traversal} +;. +; CHECK-ATTRSIZE-8: [[META0]] = !{i32 8} +; CHECK-ATTRSIZE-8: [[META4]] = !{i32 7} +; CHECK-ATTRSIZE-8: [[META5]] = !{i32 6} +; CHECK-ATTRSIZE-8: [[META6]] = !{i32 3} +; CHECK-ATTRSIZE-8: [[META7]] = !{ptr @_cont_Traversal} +;. diff --git a/llvmraytracing/unittests/CMakeLists.txt b/llvmraytracing/unittests/CMakeLists.txt index 90b490a692..b9b62d3f95 100644 --- a/llvmraytracing/unittests/CMakeLists.txt +++ b/llvmraytracing/unittests/CMakeLists.txt @@ -1,3 +1,28 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. 
+ # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + # Raytracing Unit tests. # To execute all unit tests, run: # cmake --build . --target check-raytracing-units diff --git a/script/spv-to-shaderdb-test.py b/script/spv-to-shaderdb-test.py index 7a3957025f..df048785bf 100644 --- a/script/spv-to-shaderdb-test.py +++ b/script/spv-to-shaderdb-test.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + + """ spv-to-shaderdb-test.py -- Script to create a shaderdb test from spir-v assembly files. diff --git a/test/amber/a16.amber b/test/amber/a16.amber index eed863246b..9ec4d18287 100644 --- a/test/amber/a16.amber +++ b/test/amber/a16.amber @@ -50,7 +50,8 @@ layout(location = 0) out vec4 color_out; layout(set = 0, binding = 0) uniform highp sampler2D tex; void main() { -// CHECK: image_sample_b {{.*}} a16 +// FIXME: Should be a16 +// CHECK: image_sample_b {{.*}} vec2 pos = (position_in + vec2(1.0)) / 2.0; float vi = round(vert_i); // Round to integer to get striped pattern // Convert address and bias to f16 to generate an a16 sample. diff --git a/test/query_gfxip.py b/test/query_gfxip.py index 72cfeac23e..7fea16d514 100755 --- a/test/query_gfxip.py +++ b/test/query_gfxip.py @@ -1,142 +1,60 @@ #!/usr/bin/env python3 -# Map PCI device ids to a gfxip number -# The list of PCI ids is taken from https://pci-ids.ucw.cz/read/PC/1002 under the terms of the 3-clause BSD License +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + -# Copyright (c) <2021> . All rights reserved. -# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Map PCI device ids to a gfxip number import glob import sys VENDOR_AMD = "1002" -device_to_gfxip = { - "13e9": "10.1", - "1478": "10.1", - "15d8": "9", - "15dd": "9", - "15e7": "9", - "1607": "10.3", - "1636": "9", - "1638": "9", - "163f": "10.3", - "164c": "9", - "164d": "10.3", - "1681": "10.3", - - "66a0": "9", - "66a1": "9", - "66a2": "9", - "66a3": "9", - "66a7": "9", - "66af": "9", - - "67c0": "8", - "67c2": "8", - "67c4": "8", - "67c7": "8", - "67ca": "8", - "67cc": "8", - "67cf": "8", - "67d0": "8", - "67d4": "8", - "67d7": "8", - "67df": "8", - "67e0": "8", - "67e1": "8", - "67e3": "8", - "67e8": "8", - "67e9": "8", - "67eb": "8", - "67ef": "8", - "67ff": "8", - - "6860": "9", - "6861": "9", - "6862": "9", - "6863": "9", - "6864": "9", - "6867": "9", - "6868": "9", - "6869": "9", - "686a": "9", - "686b": "9", - "686c": "9", - "686d": "9", - "686e": "9", - "687f": "9", - - "694c": "8", - "694e": "8", - "694f": "8", - "6980": "8", - "6986": "8", - - "69a0": "9", - "69a1": "9", - "69a2": "9", - "69a3": "9", - "69af": "9", - - "6fdf": "8", - - "7310": "10.1", - "7312": "10.1", - "7314": "10.1", - "731f": "10.1", - "7340": "10.1", - "7341": "10.1", - "7347": "10.1", - "734f": "10.1", - "7360": "10.1", - "7362": 
"10.1", - - "73a2": "10.3", - "73a3": "10.3", - "73a4": "10.3", - "73ab": "10.3", - "73af": "10.3", - "73bf": "10.3", - "73c3": "10.3", - "73c4": "10.3", - "73df": "10.3", - "73e0": "10.3", - "73e1": "10.3", - "73e3": "10.3", - "73e4": "10.3", - "73ff": "10.3", - - "7408": "9", - "740c": "9", - "740f": "9", -} - -def parse_gfxip(s): - """Returns [major, minor, patch]""" - arr = [int(i) for i in s.split(".")] - while len(arr) < 3: - arr.append(0) - return arr +def get_gfxip(device): + # Get version of the GC (Graphics and Compute) block as exposed by the kernel + with open(f"/sys/class/drm/card{device}/device/ip_discovery/die/0/GC/0/major") as f: + major = int(f.read()) + with open(f"/sys/class/drm/card{device}/device/ip_discovery/die/0/GC/0/minor") as f: + minor = int(f.read()) + return [major, minor] def gfxip_to_str(ip): return ".".join([str(i) for i in ip]) -def find_gfxips(device_id): - """Find the gxfips of the given PCI device id""" - ip = parse_gfxip(device_to_gfxip[device_id]) - [maj, min, pat] = ip +def find_gfxips(device): + """Find the gfxips of the given PCI device id""" + ip = get_gfxip(device) + [maj, min] = get_gfxip(device) - gfxips = [[maj, min], [maj, min, pat]] + gfxips = [ip] for maj_i in range(9, maj + 1): gfxips.append([maj_i]) return ["gfx" + gfxip_to_str(ip)] + ["gfx" + gfxip_to_str(i) + "+" for i in gfxips] def query_gfxips(device = None): - """Find all gxfips of device or of the first AMD GPU on the system""" + """Find all gfxips of device or of the first AMD GPU on the system""" if device is None: amd_cards = [] for card in glob.glob("/sys/class/drm/card*"): @@ -159,9 +77,12 @@ def query_gfxips(device = None): if vendor_id != f"0x{VENDOR_AMD}": raise Exception(f"Vendor {vendor_id} is not AMD (0x{VENDOR_AMD})") - with open("/sys/class/drm/card0/device/device") as f: + with open(f"/sys/class/drm/card{device}/device/device") as f: device_id = f.read().strip().replace("0x", "") - return find_gfxips(device_id) + + gfxips = find_gfxips(device) 
+ print(f"Chosen device: gfx{gfxip_to_str(get_gfxip(device))} (card{device}, pci id 0x{device_id})") + return gfxips if __name__ == '__main__': print(query_gfxips(None if len(sys.argv) < 2 else sys.argv[1])) diff --git a/test/run_amber_test.py b/test/run_amber_test.py index 4fb1dabbd5..f39b536f70 100755 --- a/test/run_amber_test.py +++ b/test/run_amber_test.py @@ -37,6 +37,21 @@ import sys import tempfile +def print_dump(dump_dir, file): + """Print a pipeline dump to stdout""" + print("Dump " + file) + with open(os.path.join(dump_dir, file)) as f: + print(f.read()) + +def print_dump_dir(dir): + """Print all pipeline dumps from a directory to stdout""" + for f in os.listdir(dir): + if f.endswith(".pipe"): + print_dump(dir, f) + elif os.path.isdir(os.path.join(dir, f)): + # Search recursively, new drivers create a directory per app + print_dump_dir(os.path.join(dir, f)) + def main(): parser = argparse.ArgumentParser(description='Run an amber test and print generated pipelines to stdout') parser.add_argument('--icd', @@ -72,17 +87,14 @@ def main(): os.environ["AMD_DEBUG_DIR"] = tmp_dir os.environ["HOME"] = tmp_dir dump_dir = os.path.join(tmp_dir, "spvPipeline") + os.makedirs(dump_dir) # Run amber cmd = ["amber"] + [args.file] res = subprocess.run(cmd) # Print pipeline dumps - for f in os.listdir(dump_dir): - if f.endswith(".pipe"): - print("Dump " + f) - with open(os.path.join(dump_dir, f)) as f: - print(f.read()) + print_dump_dir(dump_dir) print("Dump End") diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index cd3c0f5ef8..c0bcbef7b7 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -714,6 +714,8 @@ void PipelineDumper::dumpPipelineShaderInfo(const PipelineShaderInfo *shaderInfo dumpFile << "options.imageSampleDrefReturnsRgba = " << shaderInfo->options.imageSampleDrefReturnsRgba << "\n"; dumpFile << "options.disableGlPositionOpt = " << shaderInfo->options.disableGlPositionOpt << "\n"; 
dumpFile << "options.viewIndexFromDeviceIndex = " << shaderInfo->options.viewIndexFromDeviceIndex << "\n"; + dumpFile << "options.forceUnderflowPrevention = " << shaderInfo->options.forceUnderflowPrevention << "\n"; + dumpFile << "options.forceMemoryBarrierScope = " << shaderInfo->options.forceMemoryBarrierScope << "\n"; dumpFile << "\n"; // clang-format on } @@ -1320,13 +1322,14 @@ void PipelineDumper::dumpRayTracingStateInfo(const RayTracingPipelineBuildInfo * dumpFile << "attributeSizeMaxInLib = " << pipelineInfo->attributeSizeMaxInLib << "\n"; dumpFile << "hasPipelineLibrary = " << pipelineInfo->hasPipelineLibrary << "\n"; dumpFile << "pipelineLibStageMask = " << pipelineInfo->pipelineLibStageMask << "\n"; + dumpFile << "rtIgnoreDeclaredPayloadSize = " << pipelineInfo->rtIgnoreDeclaredPayloadSize << "\n"; for (unsigned i = 0; i < pipelineInfo->gpurtOptionCount; ++i) { auto gpurtOption = &pipelineInfo->pGpurtOptions[i]; dumpFile << "gpurtOptions[" << i << "].nameHash = " - << "0x" << std::hex << gpurtOption->nameHash << "\n"; + << "0x" << std::hex << gpurtOption->nameHash << std::dec << "\n"; dumpFile << "gpurtOptions[" << i << "].value = " - << "0x" << std::hex << gpurtOption->value << "\n"; + << "0x" << std::hex << gpurtOption->value << std::dec << "\n"; } } @@ -1608,10 +1611,10 @@ MetroHash::Hash PipelineDumper::generateHashForGraphicsPipeline(const GraphicsPi if (outLocationMaps != nullptr && unlinkedShaderType != UnlinkedStageFragment) { for (unsigned i = 0; i < ShaderStageFragment; ++i) { if (outLocationMaps[i].count > 0) { - hasher.Update(reinterpret_cast(outLocationMaps->oldLocation), - sizeof(uint32_t) * outLocationMaps->count); - hasher.Update(reinterpret_cast(outLocationMaps->newLocation), + hasher.Update(reinterpret_cast(outLocationMaps[i].oldLocation), sizeof(uint32_t) * outLocationMaps->count); + hasher.Update(reinterpret_cast(outLocationMaps[i].newLocation), + sizeof(uint32_t) * outLocationMaps[i].count); hasher.Update(outLocationMaps[i].count); } 
} @@ -1677,6 +1680,24 @@ MetroHash::Hash PipelineDumper::generateHashForComputePipeline(const ComputePipe updateHashForUniformConstantMap(pipeline->pUniformMap, &hasher); } + // Hash the graphics state for transform pipeline + auto transformPipeline = pipeline->transformGraphicsPipeline; + bool enableTransformPipeline = (transformPipeline != nullptr); + hasher.Update(enableTransformPipeline); + + if (enableTransformPipeline) { + updateHashForPipelineShaderInfo(ShaderStageVertex, &transformPipeline->vs, isCacheHash, &hasher); + updateHashForResourceMappingInfo(&transformPipeline->resourceMapping, + transformPipeline->unlinked ? 0 : transformPipeline->pipelineLayoutApiHash, + &hasher); + hasher.Update(transformPipeline->unlinked); + hasher.Update(transformPipeline->dynamicTopology); + hasher.Update(transformPipeline->enableInitUndefZero); + updateHashForPipelineOptions(&transformPipeline->options, &hasher, isCacheHash, UnlinkedStageVertexProcess); + updateHashForVertexInputState(transformPipeline->pVertexInput, transformPipeline->dynamicVertexStride, &hasher); + updateHashForNonFragmentState(transformPipeline, isCacheHash, &hasher); + } + MetroHash::Hash hash = {}; hasher.Finalize(hash.bytes); @@ -1731,6 +1752,7 @@ MetroHash::Hash PipelineDumper::generateHashForRayTracingPipeline(const RayTraci hasher.Update(pipeline->payloadSizeMaxInLib); hasher.Update(pipeline->attributeSizeMaxInLib); + hasher.Update(pipeline->rtIgnoreDeclaredPayloadSize); #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 62 if (isCacheHash) { @@ -2002,6 +2024,18 @@ void PipelineDumper::updateHashForPipelineOptions(const PipelineOptions *options hasher->Update(options->getGlState().enableRemapLocation); // disablePerCompFetch has been handled in updateHashForNonFragmentState hasher->Update(options->optimizePointSizeWrite); + hasher->Update(options->compileConstInfo != nullptr); + if (options->compileConstInfo != nullptr) { + hasher->Update(options->compileConstInfo->numCompileTimeConstants); + for 
(uint32_t i = 0; i < options->compileConstInfo->numCompileTimeConstants; i++) { + auto constItem = options->compileConstInfo->pCompileTimeConstants[i]; + hasher->Update(constItem.offset); + hasher->Update(constItem.set); + hasher->Update(constItem.binding); + hasher->Update(constItem.validBytes); + hasher->Update(constItem.values.u8, constItem.validBytes); + } + } } // ===================================================================================================================== @@ -2092,6 +2126,8 @@ void PipelineDumper::updateHashForPipelineShaderInfo(ShaderStage stage, const Pi hasher->Update(options.imageSampleDrefReturnsRgba); hasher->Update(options.disableGlPositionOpt); hasher->Update(options.viewIndexFromDeviceIndex); + hasher->Update(options.forceUnderflowPrevention); + hasher->Update(options.forceMemoryBarrierScope); } } } diff --git a/tool/update_llpc_test_checks.py b/tool/update_llpc_test_checks.py index dd7d81058f..0c65996fb1 100755 --- a/tool/update_llpc_test_checks.py +++ b/tool/update_llpc_test_checks.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +## + ####################################################################################################################### + # + # Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to + # deal in the Software without restriction, including without limitation the + # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + # sell copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. 
+ # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + # IN THE SOFTWARE. + # + ####################################################################################################################### + + # This script is based on the LLVM Project's update_test_checks.py, which is # licensed under the Apache License v2.0 with LLVM Exceptions; see the file diff --git a/tool/vfx/vfxParser.cpp b/tool/vfx/vfxParser.cpp index a87ee705d5..d62dc287f0 100644 --- a/tool/vfx/vfxParser.cpp +++ b/tool/vfx/vfxParser.cpp @@ -58,6 +58,7 @@ bool parseUint(char *str, unsigned lineNum, IUFValue *output); bool parseFloat(char *str, unsigned lineNum, IUFValue *output); bool parseFloat16(char *str, unsigned lineNum, IUFValue *output); bool parseDouble(char *str, unsigned lineNum, IUFValue *output); +bool parseInt64(char *str, unsigned lineNum, IUFValue *output); bool parseBool(char *str, unsigned lineNum, IUFValue *output, std::string *errorMsg); @@ -459,6 +460,13 @@ bool Document::parseKeyValue(char *key, char *valueStr, unsigned lineNum, Sectio result = accessedSectionObject->set(lineNum, memberName, arrayIndex, &(value.dVec2[0])); break; } + case MemberTypeInt64: + case MemberTypeUint64: { + result = parseInt64(valueStr, lineNum, &value); + if (result) + result = accessedSectionObject->set(lineNum, memberName, arrayIndex, &(value.i64Vec2[0])); + break; + } case MemberTypeBool: { result = parseBool(valueStr, lineNum, &value, &m_errorMsg); if (result) { @@ -829,6 +837,34 @@ bool parseBool(char *str, unsigned lineNum, IUFValue *output, std::string *error return result; } +// 
===================================================================================================================== +// Parses a int64 number from a string. +// +// @param str : Input string +// @param lineNum : Current line number +// @param [out] output : Stores parsed value +bool parseInt64(char *str, unsigned lineNum, IUFValue *output) { + VFX_ASSERT(output); + bool result = true; + + bool isHex = false; + char *p0x = strstr(str, "0x"); + if (p0x) + isHex = true; + + if (isHex) + output->i64Vec2[0] = strtoull(str, nullptr, 0); + else + output->i64Vec2[0] = strtoll(str, nullptr, 0); + + output->props.isInt64 = true; + output->props.isFloat = false; + output->props.isDouble = false; + output->props.length = 1; + + return result; +} + // ===================================================================================================================== // Parses a integer vec4 from a string. // NOTE: content of str will be changed. diff --git a/tool/vfx/vfxSection.cpp b/tool/vfx/vfxSection.cpp index 9669a6864d..a387e71953 100644 --- a/tool/vfx/vfxSection.cpp +++ b/tool/vfx/vfxSection.cpp @@ -351,6 +351,16 @@ void Section::printSelf(Document *pDoc, unsigned level) { printf("%s = %.3f\n", m_memberTable[i].memberName, *(((double *)(getMemberAddr(i))) + arrayIndex)); break; } + case MemberTypeInt64: { + printf("%s = %" PRId64 "\n", m_memberTable[i].memberName, + *(((int64_t *)(getMemberAddr(i))) + arrayIndex)); + break; + } + case MemberTypeUint64: { + printf("%s = %" PRIu64 "\n", m_memberTable[i].memberName, + *(((uint64_t *)(getMemberAddr(i))) + arrayIndex)); + break; + } case MemberTypeIVec4: { IUFValue *iufValue = static_cast(getMemberAddr(i)); iufValue += arrayIndex; diff --git a/tool/vfx/vfxSection.h b/tool/vfx/vfxSection.h index d11070f32c..9aa934cf28 100644 --- a/tool/vfx/vfxSection.h +++ b/tool/vfx/vfxSection.h @@ -87,6 +87,8 @@ enum MemberType : unsigned { MemberTypeFloat, // VFX member type: 32 bit float MemberTypeFloat16, // VFX member type: 16 bit 
float MemberTypeDouble, // VFX member type: 64 bit double + MemberTypeInt64, // VFX member type: 64 bit integer + MemberTypeUint64, // VFX member type: 64 bit unsigned integer MemberTypeBool, // VFX member type: boolean MemberTypeIVec4, // VFX member type: int vec4 MemberTypeI64Vec2, // VFX member type: int64 vec2 diff --git a/tool/vfx/vfxVkSection.cpp b/tool/vfx/vfxVkSection.cpp index a27f2c8f89..c77403f378 100644 --- a/tool/vfx/vfxVkSection.cpp +++ b/tool/vfx/vfxVkSection.cpp @@ -1,3 +1,28 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ * + **********************************************************************************************************************/ + #include "vfxEnumsConverter.h" #include "vfxError.h" #include "vfxSection.h" diff --git a/tool/vfx/vfxVkSection.h b/tool/vfx/vfxVkSection.h index f6596a157d..de14386acb 100644 --- a/tool/vfx/vfxVkSection.h +++ b/tool/vfx/vfxVkSection.h @@ -1,3 +1,28 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ + #pragma once #include "vfxSection.h" @@ -242,7 +267,7 @@ class SectionShaderOption : public Section { m_state.clientHash.upper = m_clientHash.i64Vec2[1]; state = m_state; }; - SubState &getSubStateRef() { return m_state; }; + SubState &getSubStateRef() { return m_state; } private: static StrToMemberAddrArrayRef getAddrTable() { @@ -298,6 +323,8 @@ class SectionShaderOption : public Section { INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, imageSampleDrefReturnsRgba, MemberTypeBool, false); INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, disableGlPositionOpt, MemberTypeBool, false); INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, viewIndexFromDeviceIndex, MemberTypeBool, false); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, forceUnderflowPrevention, MemberTypeBool, false); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionShaderOption, forceMemoryBarrierScope, MemberTypeInt, false); return addrTableInitializer; }(); return {addrTable.data(), addrTable.size()}; @@ -1226,6 +1253,7 @@ class SectionRayTracingState : public Section { INIT_STATE_MEMBER_NAME_TO_ADDR(SectionRayTracingState, isReplay, MemberTypeBool, false); INIT_MEMBER_NAME_TO_ADDR(SectionRayTracingState, m_clientMetadata, MemberTypeU8Array, false); INIT_STATE_MEMBER_NAME_TO_ADDR(SectionRayTracingState, cpsFlags, MemberTypeInt, false); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionRayTracingState, rtIgnoreDeclaredPayloadSize, MemberTypeBool, false); INIT_MEMBER_DYNARRAY_NAME_TO_ADDR(SectionRayTracingState, m_gpurtOptions, MemberTypeGpurtOption, true); return addrTableInitializer; }(); diff --git a/util/extensions.txt b/util/extensions.txt index 8bd2f7b5fe..d029a823d4 100644 --- a/util/extensions.txt +++ b/util/extensions.txt @@ -49,3 +49,4 @@ SPV_KHR_maximal_reconvergence SPV_KHR_expect_assume SPV_KHR_shader_quad_control SPV_KHR_subgroup_rotate +SPV_EXT_replicated_composites 
diff --git a/util/vkgcCapability.h b/util/vkgcCapability.h index 97d66b65db..efff108a0e 100644 --- a/util/vkgcCapability.h +++ b/util/vkgcCapability.h @@ -165,6 +165,7 @@ static const char *const VkgcSupportedCapabilities[] = { "CapabilityExpectAssumeKHR", "CapabilityQuadControl", "CapabilityGroupNonUniformRotateKHR", + "CapabilityReplicatedCompositesEXT", }; }; // namespace Vkgc diff --git a/util/vkgcExtension.cpp b/util/vkgcExtension.cpp index bd32b52fe8..4926364ee1 100644 --- a/util/vkgcExtension.cpp +++ b/util/vkgcExtension.cpp @@ -104,6 +104,7 @@ const ExtensionNamePair ExtensionNameTable[ExtensionCount] = { DeclExtensionName(KHR_SHADER_QUAD_CONTROL), DeclExtensionName(KHR_SUBGROUP_ROTATE), DeclExtensionName(KHR_COMPUTE_SHADER_DERIVATIVES), + DeclExtensionName(EXT_REPLICATED_COMPOSITES), }; // ===================================================================================================================== diff --git a/util/vkgcExtension.h b/util/vkgcExtension.h index 80874af8c2..450d3c886e 100644 --- a/util/vkgcExtension.h +++ b/util/vkgcExtension.h @@ -86,6 +86,7 @@ enum Extension : unsigned { KHR_SHADER_QUAD_CONTROL, KHR_SUBGROUP_ROTATE, KHR_COMPUTE_SHADER_DERIVATIVES, + EXT_REPLICATED_COMPOSITES, ExtensionCount, }; diff --git a/version/CMakeLists.txt b/version/CMakeLists.txt index 1ce7218dea..62404537a9 100644 --- a/version/CMakeLists.txt +++ b/version/CMakeLists.txt @@ -122,3 +122,4 @@ endforeach() # Report the summary of what is enabled. 
message(STATUS "LLPC_BUILD_* summary: ${LLPC_BUILD_SUMMARY}") + diff --git a/version/include/llpc/GpurtIntrinsics.h b/version/include/llpc/GpurtIntrinsics.h index 56308172b8..8d451867c2 100644 --- a/version/include/llpc/GpurtIntrinsics.h +++ b/version/include/llpc/GpurtIntrinsics.h @@ -62,7 +62,8 @@ #endif #endif -#define PASS_32_BIT_CR 1 +#define PASS_DUMMY_RET_ADDR 1 +#define PASS_SHADER_INDEX_ARG 1 //===================================================================================================================== // Continuation intrinsics @@ -212,9 +213,6 @@ GPURT_DECL uint32_t _AmdContStackGetPtr() DUMMY_GENERIC_FUNC(0) // //===================================================================================================================== // State (system data / hit attributes) modifier intrinsics -// void _AmdRestoreSystemData*(in SystemData data) -#define DECLARE_RESTORE_SYSTEM_DATA(Suffix, ...) GPURT_DECL \ - void _AmdRestoreSystemData##Suffix(__VA_ARGS__) DUMMY_VOID_FUNC // void _AmdAcceptHitAttributes*(inout SystemData data) #define DECLARE_ACCEPT_HIT_ATTRIBUTES(Suffix, ...) GPURT_DECL \ void _AmdAcceptHitAttributes##Suffix(__VA_ARGS__) DUMMY_VOID_FUNC diff --git a/version/include/llpcVersion.h.in b/version/include/llpcVersion.h.in index 8ab75db3e6..4979d9b48c 100644 --- a/version/include/llpcVersion.h.in +++ b/version/include/llpcVersion.h.in @@ -37,6 +37,8 @@ // %Version History // | %Version | Change Description | // | -------- | ----------------------------------------------------------------------------------------------------- | +// | 75.9 | Add rtIgnoreDeclaredPayloadSize to RayTracingPipelineBuildInfo. | +// | 75.8 | Add forceMemoryBarrierScope to PipelineShaderOptions. | // | 75.6 | Add enableRemapLocation to PipelineOptions. Add outLocationMaps to GraphicsPipelineBuildInfo. | // | 75.5 | Add optimizePointSizeWrite to PipelineShaderOptions in order to optimize the case PointSize = 1.0. 
| // | 75.4 | Add disableGlPositionOpt to PipelineShaderOptions. | @@ -198,7 +200,7 @@ #define LLPC_INTERFACE_MAJOR_VERSION 75 /// LLPC minor interface version. -#define LLPC_INTERFACE_MINOR_VERSION 6 +#define LLPC_INTERFACE_MINOR_VERSION 9 /// The client's LLPC major interface version #ifndef LLPC_CLIENT_INTERFACE_MAJOR_VERSION