From d180bcf171ceb6d0d258fe681c46b05528a91c42 Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou
Date: Wed, 11 Oct 2023 23:47:07 -0700
Subject: [PATCH] [LLPC] Scalarize non-uniform loads inside the waterfall loop

---
 include/vkgcDefs.h                            |   3 +-
 lgc/builder/BuilderImpl.cpp                   | 331 +++++++++++-------
 lgc/include/lgc/builder/BuilderImpl.h         |  11 +
 llpc/context/llpcPipelineContext.cpp          |  11 +-
 ...peSampledImage_TestWaterfallInsertion.frag |  28 +-
 ...peSampledImage_TestWaterfallScalarize.frag |  50 ++-
 ...age_TestWaterfallScalarize_MultiBlock.frag | 103 ++++--
 ...age_TestWaterfallScalarize_SharedDesc.frag |  76 ++--
 tool/dumper/vkgcPipelineDumper.cpp            |   3 +-
 9 files changed, 406 insertions(+), 210 deletions(-)

diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h
index 2d8ff2159e..c88357a396 100644
--- a/include/vkgcDefs.h
+++ b/include/vkgcDefs.h
@@ -241,6 +241,7 @@ struct optional_bool : private std::optional<bool> {
   using std::optional<bool>::has_value;
   using std::optional<bool>::value;
   using std::optional<bool>::value_or;
+  using std::optional<bool>::operator*;
 };
 
 /// Enumerates result codes of LLPC operations.
@@ -873,7 +874,7 @@ struct PipelineShaderOptions {
   unsigned ldsSpillLimitDwords;
 
   /// Attempt to scalarize waterfall descriptor loads.
-  bool scalarizeWaterfallLoads;
+  optional_bool scalarizeWaterfallLoads;
 
   /// Force rearranges threadId within group into blocks of 8*8 or 8*4
   bool overrideForceThreadIdSwizzling;
diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp
index 2a3197be06..428bddacc2 100644
--- a/lgc/builder/BuilderImpl.cpp
+++ b/lgc/builder/BuilderImpl.cpp
@@ -336,20 +336,18 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine &
 
 #if defined(LLVM_HAVE_BRANCH_AMD_GFX)
 // =====================================================================================================================
-// For a non-uniform input, try and trace back through a descriptor load to
-// find the non-uniform index used in it. If that fails, we just use the
-// operand value as the index.
-//
-// Note that this function may return null, which means that the given value has been shown to be uniform.
-//
-// This uses a fairly simple heuristic that nevertheless allows temporary expansion of the search breadth to handle
-// the common case where a base pointer is assembled from separate high and low halves.
-//
+
+// Support function for traceNonUniformIndex(). Get the load of the non-uniform operand of the non-uniform descriptor
+// and the chain of instructions that leads to it.
 // @param nonUniformVal : Value representing non-uniform descriptor
-// @return : Value representing the non-uniform index, or null if nonUniformVal could be proven to be uniform
-static Value *traceNonUniformIndex(Value *nonUniformVal) {
+// @param loadChain : Chain of instructions that gives us the load of the non-uniform operand
+// @return : non-uniform load, or nullptr if there is no non-uniform load
+static LoadInst *getNonUniformLoad(Value *nonUniformVal, DenseMap<Value *, SmallVector<Value *>> &loadChain) {
+
   auto load = dyn_cast<LoadInst>(nonUniformVal);
-  if (!load) {
+  if (load)
+    loadChain[load].push_back(load->getOperand(0));
+  else {
     // Workarounds that modify image descriptor can be peeped through, i.e.
     // %baseValue = load <8 x i32>, <8 x i32> addrspace(4)* %..., align 16
     // %rawElement = extractelement <8 x i32> %baseValue, i64 6
@@ -357,28 +355,53 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
     // %nonUniform = insertelement <8 x i32> %baseValue, i32 %updatedElement, i64 6
     auto insert = dyn_cast<InsertElementInst>(nonUniformVal);
     if (!insert)
-      return nonUniformVal;
+      return nullptr;
 
     load = dyn_cast<LoadInst>(insert->getOperand(0));
     if (!load)
-      return nonUniformVal;
+      return nullptr;
 
     // We found the load, but must verify the chain.
     // Consider updatedElement as a generic instruction or constant.
-    if (auto updatedElement = dyn_cast<Instruction>(insert->getOperand(1))) {
+    Value *insertOp1 = insert->getOperand(1);
+    if (auto updatedElement = dyn_cast<Instruction>(insertOp1)) {
+      loadChain[insert].push_back(insertOp1);
       for (Value *operand : updatedElement->operands()) {
         if (auto extract = dyn_cast<ExtractElementInst>(operand)) {
           // Only dynamic value must be ExtractElementInst based on load.
-          if (dyn_cast<LoadInst>(extract->getOperand(0)) != load)
-            return nonUniformVal;
+          loadChain[insert].push_back(extract);
+          if (dyn_cast<LoadInst>(extract->getOperand(0)) != load) {
+            loadChain.clear();
+            return nullptr;
+          }
         } else if (!isa<Constant>(operand)) {
-          return nonUniformVal;
+          loadChain.clear();
+          return nullptr;
         }
       }
-    } else if (!isa<Constant>(insert->getOperand(1))) {
-      return nonUniformVal;
+    } else if (!isa<Constant>(insertOp1)) {
+      loadChain.clear();
+      return nullptr;
     }
+    loadChain[insert].push_back(load);
+    loadChain[insert].push_back(load->getOperand(0));
   }
+  return load;
+}
+
+// For a non-uniform input, try and trace back through a descriptor load to
+// find the non-uniform index used in it. If that fails, we just use the
+// operand value as the index.
+//
+// Note that this function may return null, which means that the given value has been shown to be uniform.
+//
+// This uses a fairly simple heuristic that nevertheless allows temporary expansion of the search breadth to handle
+// the common case where a base pointer is assembled from separate high and low halves.
+//
+// @param nonUniformVal : Value representing non-uniform descriptor
+// @param load : Load operand of the non-uniform descriptor
+// @return : Value representing the non-uniform index, or nullptr if nonUniformVal could be proven to be uniform
+static Value *traceNonUniformIndex(Value *nonUniformVal, Instruction *load) {
 
   auto getSize = [](Value *value) -> uint64_t {
     uint64_t size = value->getType()->getPrimitiveSizeInBits().getFixedValue();
@@ -503,36 +526,151 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
   return candidateIndex;
 }
+#endif // Guard for amd gfx branch.
 
-// =====================================================================================================================
-// Test whether two instructions are identical
-// or are the same operation on identical operands.
-// @param lhs : First instruction
-// @param rhs : Second instruction
-// @return Result of equally test
-static bool instructionsEqual(Instruction *lhs, Instruction *rhs) {
-  if (lhs->isIdenticalTo(rhs))
-    return true;
-
-  if (!lhs->isSameOperationAs(rhs))
-    return false;
-
-  for (unsigned idx = 0, end = lhs->getNumOperands(); idx != end; ++idx) {
-    Value *lhsVal = lhs->getOperand(idx);
-    Value *rhsVal = rhs->getOperand(idx);
-    if (lhsVal == rhsVal)
-      continue;
-    Instruction *lhsInst = dyn_cast<Instruction>(lhsVal);
-    Instruction *rhsInst = dyn_cast<Instruction>(rhsVal);
-    if (!lhsInst || !rhsInst)
-      return false;
-    if (!lhsInst->isIdenticalTo(rhsInst))
-      return false;
+// Emit @llvm.amdgcn.waterfall.begin and @llvm.amdgcn.waterfall.readfirstlane intrinsics for non-uniform descriptors
+// which do not need scalarization.
+//
+// @param nonUniformInst : The instruction to put in a waterfall loop
+// @param operandIdxs : The operand index/indices for non-uniform inputs that need to be uniform
+// @param nonUniformIndices : Non-uniform operands of the non-uniform instruction
+// @param useVgprForOperands : Non-uniform inputs should be put in VGPRs
+// @param instName : Name to give instruction(s)
+// @return : llvm.amdgcn.waterfall.begin intrinsic
+Value *BuilderImpl::emitWaterfallBegin(Instruction *nonUniformInst, ArrayRef<unsigned> operandIdxs,
+                                       ArrayRef<Value *> nonUniformIndices, bool useVgprForOperands,
+                                       const Twine &instName) {
+  Value *waterfallBegin = nullptr;
+
+  // Insert new code just before nonUniformInst.
+  SetInsertPoint(nonUniformInst);
+
+  // The first begin contains a null token for the previous token argument
+  waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
+  for (auto nonUniformIndex : nonUniformIndices)
+    // Start the waterfall loop using the waterfall index.
+    waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, nonUniformIndex->getType(),
+                                     {waterfallBegin, nonUniformIndex}, nullptr, instName);
+
+  // Scalarize each non-uniform operand of the instruction.
+  for (unsigned operandIdx : operandIdxs) {
+    Value *desc = nonUniformInst->getOperand(operandIdx);
+    auto descTy = desc->getType();
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892
+    // Old version of the code
+#else
+    // When the non-uniform use is in a VGPR, we can save a v_mov by not inserting the amdgcn_waterfall_readfirstlane
+    if (!useVgprForOperands)
+#endif
+      desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, {waterfallBegin, desc}, nullptr,
+                             instName);
+    if (nonUniformInst->getType()->isVoidTy()) {
+      // The buffer/image operation we are waterfalling is a store with no return value. Use
+      // llvm.amdgcn.waterfall.last.use on the descriptor.
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892
+      // Old version of the code
+      desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, descTy, {waterfallBegin, desc}, nullptr, instName);
+#else
+      desc = CreateIntrinsic(useVgprForOperands ? Intrinsic::amdgcn_waterfall_last_use_vgpr
+                                                : Intrinsic::amdgcn_waterfall_last_use,
+                             descTy, {waterfallBegin, desc}, nullptr, instName);
+#endif
+    }
+    // Replace the descriptor operand in the buffer/image operation.
+    nonUniformInst->setOperand(operandIdx, desc);
   }
+  return waterfallBegin;
+}
+
+// Emit @llvm.amdgcn.waterfall.begin and @llvm.amdgcn.waterfall.readfirstlane intrinsics for scalarized non-uniform
+// descriptors.
+//
+// @param nonUniformInst : The instruction to put in a waterfall loop
+// @param firstIndexInst : First shared index instruction
+// @param operandIdxs : The operand index/indices for non-uniform inputs that need to be uniform
+// @param nonUniformIndices : Non-uniform operands of the non-uniform instruction
+// @param loadChain : Non-uniform loads
+// @param useVgprForOperands : Non-uniform inputs should be put in VGPRs
+// @param instName : Name to give instruction(s)
+// @return : llvm.amdgcn.waterfall.begin intrinsic
+Value *BuilderImpl::emitWaterfallBeginForScalarizedLoops(Instruction *nonUniformInst, Instruction *firstIndexInst,
+                                                         ArrayRef<unsigned> operandIdxs,
+                                                         ArrayRef<Value *> nonUniformIndices,
+                                                         DenseMap<Value *, SmallVector<Value *>> loadChain,
+                                                         bool useVgprForOperands, const Twine &instName) {
+
+  assert(firstIndexInst);
+  assert(nonUniformInst);
+
+  // Bail out if we cannot handle any of the operands of nonUniformInst.
+  if (loadChain.empty())
+    return emitWaterfallBegin(nonUniformInst, operandIdxs, nonUniformIndices, useVgprForOperands, instName);
+
+  for (unsigned operandIdx : operandIdxs) {
+    Value *desc = nonUniformInst->getOperand(operandIdx);
+    if (!loadChain.contains(desc))
+      return emitWaterfallBegin(nonUniformInst, operandIdxs, nonUniformIndices, useVgprForOperands, instName);
+  }
+
+  auto descTy = firstIndexInst->getType();
+  SetInsertPoint(nonUniformInst);
+  Instruction *waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy,
+                                                {ConstantInt::get(getInt32Ty(), 0), firstIndexInst}, nullptr, instName);
+  Instruction *readFirstLane = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy},
+                                               {waterfallBegin, firstIndexInst}, nullptr, instName);
+
+  // Get the instructions that should be pushed in the loop from loadChain and copy them inside the waterfall loop.
+  for (unsigned operandIdx : operandIdxs) {
+    Value *desc = nonUniformInst->getOperand(operandIdx);
+    if (isa<LoadInst>(desc)) {
+      auto *origLoad = cast<LoadInst>(desc);
+      auto *newLoad = origLoad->clone();
+      auto *origGep = dyn_cast<GetElementPtrInst>(origLoad->getOperand(0));
+      auto *newGep = origGep->clone();
+      newLoad->insertBefore(nonUniformInst);
+      nonUniformInst->setOperand(operandIdx, newLoad);
+      newGep->insertBefore(newLoad);
+      newGep->setOperand(1, readFirstLane);
+      newLoad->setOperand(0, newGep);
+    } else if (isa<InsertElementInst>(desc)) {
+      // This case is for gfx 10.3.2.
+      // Clone the instructions of loadChain.
+      auto *origInsert = cast<InsertElementInst>(desc);
+      auto *newInsert = origInsert->clone();
+      newInsert->insertBefore(nonUniformInst);
+      nonUniformInst->setOperand(operandIdx, newInsert);
+      std::map<Instruction *, Instruction *> origClonedValuesMap;
+      origClonedValuesMap[origInsert] = newInsert;
+      auto &valuesToClone = loadChain[origInsert];
+      Instruction *prevInst = newInsert;
+      for (auto *origVal : valuesToClone) {
+        auto *origInst = cast<Instruction>(origVal);
+        auto *newInst = origInst->clone();
+        newInst->insertBefore(prevInst);
+        origClonedValuesMap[origInst] = newInst;
+        prevInst = newInst;
+      }
-  return true;
+      // Update the operands of the cloned instructions.
+      for (auto [origInst, newInst] : origClonedValuesMap) {
+        for (Use &use : newInst->operands()) {
+          Value *op = use.get();
+          if (auto *opI = dyn_cast<Instruction>(op)) {
+            auto it = origClonedValuesMap.find(opI);
+            if (it == origClonedValuesMap.end())
+              continue;
+            Instruction *clonedI = it->second;
+            use.set(clonedI);
+          }
+        }
+
+        if (isa<GetElementPtrInst>(newInst))
+          newInst->setOperand(1, readFirstLane);
+      }
+    }
+  }
+  return waterfallBegin;
 }
-#endif
 
 // =====================================================================================================================
 // Create a waterfall loop containing the specified instruction.
@@ -554,14 +692,21 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
   assert(operandIdxs.empty() == false);
 
   SmallVector<Value *> nonUniformIndices;
+  DenseMap<Value *, SmallVector<Value *>> loadChain;
   for (unsigned operandIdx : operandIdxs) {
-    Value *nonUniformIndex = traceNonUniformIndex(nonUniformInst->getOperand(operandIdx));
+    Value *nonUniformVal = nonUniformInst->getOperand(operandIdx);
+    LoadInst *load = getNonUniformLoad(nonUniformVal, loadChain);
+    Value *nonUniformIndex = load ? traceNonUniformIndex(nonUniformVal, load) : nonUniformVal;
     if (nonUniformIndex)
       nonUniformIndices.push_back(nonUniformIndex);
   }
+
   if (nonUniformIndices.empty())
     return nonUniformInst;
 
+  if (nonUniformInst->getType()->isVoidTy())
+    scalarizeDescriptorLoads = false;
+
   // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the
   // waterfall loop.
   for (Value *&nonUniformVal : nonUniformIndices) {
@@ -579,105 +724,27 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
   // Find first index instruction and check if index instructions are identical.
   Instruction *firstIndexInst = nullptr;
   if (scalarizeDescriptorLoads) {
-    // FIXME: these do not actually need to be identical if we introduce multiple waterfall
-    // begin and readlane intrinsics for these.
-    bool identicalIndexes = true;
     for (Value *nonUniformVal : nonUniformIndices) {
       Instruction *nuInst = dyn_cast<Instruction>(nonUniformVal);
-      // Note: parent check here guards use of comesBefore below
-      if (!nuInst || (firstIndexInst && !instructionsEqual(nuInst, firstIndexInst)) ||
-          (firstIndexInst && nuInst->getParent() != firstIndexInst->getParent())) {
-        identicalIndexes = false;
-        break;
-      }
       if (!firstIndexInst || nuInst->comesBefore(firstIndexInst))
        firstIndexInst = nuInst;
    }

-    // Ensure we do not create a waterfall across blocks.
     // FIXME: we could use dominator check to allow scalarizing descriptor loads on multi-block spans;
     // however, this also requires backend support for multi-block waterfalls to be implemented.
-    if (!identicalIndexes || !firstIndexInst ||
-        (firstIndexInst && firstIndexInst->getParent() != nonUniformInst->getParent()))
+    if (!firstIndexInst || (firstIndexInst && firstIndexInst->getParent() != nonUniformInst->getParent()))
       scalarizeDescriptorLoads = false;
   }
 
   // Save Builder's insert point
   IRBuilder<>::InsertPointGuard guard(*this);
-  Value *waterfallBegin;
-  if (scalarizeDescriptorLoads) {
-    // Attempt to scalarize descriptor loads.
-    assert(firstIndexInst);
-    CallInst *firstCallInst = dyn_cast<CallInst>(firstIndexInst);
-    if (firstCallInst && firstCallInst->getIntrinsicID() == Intrinsic::amdgcn_waterfall_readfirstlane) {
-      // Descriptor loads are already inside a waterfall.
-      waterfallBegin = firstCallInst->getArgOperand(0);
-    } else {
-      // Begin waterfall loop just after shared index is computed.
-      // This places all dependent instructions within the waterfall loop, including descriptor loads.
-      auto descTy = firstIndexInst->getType();
-      SetInsertPoint(firstIndexInst->getNextNonDebugInstruction(false));
-      waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
-      waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst},
-                                       nullptr, instName);
-
-      // Scalarize shared index.
-      Value *desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy},
-                                    {waterfallBegin, firstIndexInst}, nullptr, instName);
-
-      // Replace all references to shared index within the waterfall loop with scalarized index.
-      // (Note: this includes the non-uniform instruction itself.)
-      // Loads using scalarized index will become scalar loads.
-      for (Value *otherNonUniformVal : nonUniformIndices) {
-        otherNonUniformVal->replaceUsesWithIf(desc, [desc, waterfallBegin, nonUniformInst](Use &U) {
-          Instruction *userInst = cast<Instruction>(U.getUser());
-          return U.getUser() != waterfallBegin && U.getUser() != desc &&
-                 userInst->getParent() == nonUniformInst->getParent() &&
-                 (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst));
-        });
-      }
-    }
-  } else {
-    // Insert new code just before nonUniformInst.
-    SetInsertPoint(nonUniformInst);
-
-    // The first begin contains a null token for the previous token argument
-    waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
-    for (auto nonUniformVal : nonUniformIndices) {
-      // Start the waterfall loop using the waterfall index.
-      waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, nonUniformVal->getType(),
-                                       {waterfallBegin, nonUniformVal}, nullptr, instName);
-    }
-
-    // Scalarize each non-uniform operand of the instruction.
-    for (unsigned operandIdx : operandIdxs) {
-      Value *desc = nonUniformInst->getOperand(operandIdx);
-      auto descTy = desc->getType();
-#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892
-      // Old version of the code
-#else
-      // When the non-uniform use is in a VGPR, we can save a v_mov by not inserting the amdgcn_waterfall_readfirstlane
-      if (!useVgprForOperands)
-#endif
-        desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, {waterfallBegin, desc},
-                               nullptr, instName);
-      if (nonUniformInst->getType()->isVoidTy()) {
-        // The buffer/image operation we are waterfalling is a store with no return value. Use
-        // llvm.amdgcn.waterfall.last.use on the descriptor.
-#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892
-        // Old version of the code
-        desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, descTy, {waterfallBegin, desc}, nullptr, instName);
-#else
-        desc = CreateIntrinsic(useVgprForOperands ? Intrinsic::amdgcn_waterfall_last_use_vgpr
-                                                  : Intrinsic::amdgcn_waterfall_last_use,
-                               descTy, {waterfallBegin, desc}, nullptr, instName);
-#endif
-      }
-      // Replace the descriptor operand in the buffer/image operation.
-      nonUniformInst->setOperand(operandIdx, desc);
-    }
-  }
+  Value *waterfallBegin = nullptr;
+  if (scalarizeDescriptorLoads)
+    waterfallBegin = emitWaterfallBeginForScalarizedLoops(nonUniformInst, firstIndexInst, operandIdxs,
+                                                          nonUniformIndices, loadChain, useVgprForOperands, instName);
+  else
+    waterfallBegin = emitWaterfallBegin(nonUniformInst, operandIdxs, nonUniformIndices, useVgprForOperands, instName);
 
   Instruction *resultValue = nonUniformInst;
 
diff --git a/lgc/include/lgc/builder/BuilderImpl.h b/lgc/include/lgc/builder/BuilderImpl.h
index c758bb77ec..eb423ec033 100644
--- a/lgc/include/lgc/builder/BuilderImpl.h
+++ b/lgc/include/lgc/builder/BuilderImpl.h
@@ -132,6 +132,17 @@ class BuilderImpl : public BuilderDefs {
 
   LgcContext *m_builderContext; // Builder context
 
+  llvm::Value *emitWaterfallBegin(llvm::Instruction *nonUniformInst, llvm::ArrayRef<unsigned> operandIdxs,
+                                  llvm::ArrayRef<llvm::Value *> nonUniformIndices, bool useVgprForOperands = false,
+                                  const llvm::Twine &instName = "");
+
+  llvm::Value *
+  emitWaterfallBeginForScalarizedLoops(llvm::Instruction *nonUniformInst, llvm::Instruction *firstIndexInst,
+                                       llvm::ArrayRef<unsigned> operandIdxs,
+                                       llvm::ArrayRef<llvm::Value *> nonUniformIndices,
+                                       llvm::DenseMap<llvm::Value *, llvm::SmallVector<llvm::Value *>> loadChain,
+                                       bool useVgprForOperands = false, const llvm::Twine &instName = "");
+
   // -------------------------------------------------------------------------------------------------------------------
   // Arithmetic operations
 public:
diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp
index cf1105a45c..c21589412d 100644
--- a/llpc/context/llpcPipelineContext.cpp
+++ b/llpc/context/llpcPipelineContext.cpp
@@ -612,13 +612,12 @@ ShaderOptions PipelineContext::computeShaderOptions(const PipelineShaderInfo &sh
     }
   }
 
-  if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0) {
+  if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0)
     shaderOptions.scalarizeWaterfallLoads = ScalarizeWaterfallDescriptorLoads;
-  } else {
-    shaderOptions.scalarizeWaterfallLoads = shaderInfo.options.scalarizeWaterfallLoads;
-    // Enable waterfall load scalarization when vgpr limit is set.
- if (shaderOptions.vgprLimit != 0 && shaderOptions.vgprLimit != UINT_MAX) - shaderOptions.scalarizeWaterfallLoads = true; + else { + shaderOptions.scalarizeWaterfallLoads = true; + if (shaderInfo.options.scalarizeWaterfallLoads.has_value()) + shaderOptions.scalarizeWaterfallLoads = *shaderInfo.options.scalarizeWaterfallLoads; } shaderOptions.sgprLimit = shaderInfo.options.sgprLimit; diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag index 6845f3f011..cae39ae86b 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag @@ -18,16 +18,18 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], vec2(0.0)); } -// BEGIN_SHADERTEST -/* -; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc -; Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc -; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -; SHADERTEST-DAG: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -; SHADERTEST: AMDLLPC SUCCESS -*/ -// END_SHADERTEST +// RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc +// Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc +// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag index fbf9c25c0f..fe0307d622 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag @@ -1,6 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset 
- #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -16,18 +13,37 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], _6); } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS -// -// END_SHADERTEST +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=GFX %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=GFX_10_3_2 %s + +// GFX-LABEL: {{^// LLPC}} pipeline patching results +// GFX: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// GFX-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// GFX-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// GFX: AMDLLPC SUCCESS + +// GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_2: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_2-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// GFX_10_3_2-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// GFX_10_3_2-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = 
getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_2-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag index 82cd87a930..0355788085 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag @@ -1,5 +1,5 @@ // Make sure that there are two non-overlapping waterfall loops -// First is scalarized and second is vector type +// The first two loops are scalarized and the last one is vector type #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -25,24 +25,87 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -// SHADERTEST: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=GFX %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=GFX_10_3_2 %s + +// GFX-LABEL: {{^// LLPC}} pipeline patching results +// GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane1:[0-9]+]] = 
call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// GFX-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// GFX-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load5]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) + +// GFX: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[load2]]) +// GFX-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// GFX-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// GFX: AMDLLPC SUCCESS + +// GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 
@llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and1]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load3]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) // -// END_SHADERTEST +// GFX_10_3_2-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX_10_3_2-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX_10_3_2-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 6 +// GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[and1]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load5]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX_10_3_2-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) + +// GFX_10_3_2: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load2]], i64 6 +// GFX_10_3_2: %[[and2:[0-9]+]] = and i32 %[[extract2]], -1048577 +// GFX_10_3_2-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load2]], i32 %[[and2]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load2]], <8 x i32> +// GFX_10_3_2: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[shufflevector2]]) +// GFX_10_3_2-NEXT: 
%[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// GFX_10_3_2-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag index 123a2bc917..c8cce2be79 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag @@ -1,7 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset -// Make sure that there are two waterfall.end operations for the samples - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -20,21 +16,61 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=GFX %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=GFX_10_3_2 %s + +// GFX-LABEL: {{^// LLPC}} pipeline patching results +// GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> 
@llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// GFX: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// GFX: AMDLLPC SUCCESS + +// GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) // -// END_SHADERTEST +// GFX_10_3_2: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// 
GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load3]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// GFX_10_3_2: AMDLLPC SUCCESS diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index 76686cf557..b799441cc2 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -647,7 +647,8 @@ void PipelineDumper::dumpPipelineShaderInfo(const PipelineShaderInfo *shaderInfo dumpFile << "options.fastMathFlags = " << shaderInfo->options.fastMathFlags << "\n"; dumpFile << "options.disableFastMathFlags = " << shaderInfo->options.disableFastMathFlags << "\n"; dumpFile << "options.ldsSpillLimitDwords = " << shaderInfo->options.ldsSpillLimitDwords << "\n"; - dumpFile << "options.scalarizeWaterfallLoads = " << shaderInfo->options.scalarizeWaterfallLoads << "\n"; + if (shaderInfo->options.scalarizeWaterfallLoads.has_value()) + dumpFile << "options.scalarizeWaterfallLoads = " << *shaderInfo->options.scalarizeWaterfallLoads << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeX = " << shaderInfo->options.overrideShaderThreadGroupSizeX << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeY = " << shaderInfo->options.overrideShaderThreadGroupSizeY << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeZ = " << shaderInfo->options.overrideShaderThreadGroupSizeZ << "\n";
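
The option-resolution change in llpcPipelineContext.cpp above amounts to a simple precedence rule: an explicit -scalarize-waterfall-descriptor-loads command-line setting wins, otherwise a per-shader scalarizeWaterfallLoads value wins, and otherwise scalarization is now enabled by default. Below is a minimal standalone sketch of that rule for illustration only; the helper name and parameters are hypothetical and this is not LLPC code.

#include <optional>

// cmdLineFlag models ScalarizeWaterfallDescriptorLoads (set only when the option appears on the command line);
// perShaderOption models the new optional_bool PipelineShaderOptions::scalarizeWaterfallLoads.
static bool resolveScalarizeWaterfallLoads(std::optional<bool> cmdLineFlag, std::optional<bool> perShaderOption) {
  if (cmdLineFlag.has_value())
    return *cmdLineFlag;        // an explicit command-line option takes precedence
  if (perShaderOption.has_value())
    return *perShaderOption;    // otherwise honor the per-shader pipeline option
  return true;                  // otherwise waterfall load scalarization defaults to on
}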