From d180bcf171ceb6d0d258fe681c46b05528a91c42 Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou
Date: Wed, 11 Oct 2023 23:47:07 -0700
Subject: [PATCH] [LLPC] Scalarize non-uniform loads inside the waterfall loop

---
 include/vkgcDefs.h                            |   3 +-
 lgc/builder/BuilderImpl.cpp                   | 331 +++++++++++-------
 lgc/include/lgc/builder/BuilderImpl.h         |  11 +
 llpc/context/llpcPipelineContext.cpp          |  11 +-
 ...peSampledImage_TestWaterfallInsertion.frag |  28 +-
 ...peSampledImage_TestWaterfallScalarize.frag |  50 ++-
 ...age_TestWaterfallScalarize_MultiBlock.frag | 103 ++++--
 ...age_TestWaterfallScalarize_SharedDesc.frag |  76 ++--
 tool/dumper/vkgcPipelineDumper.cpp            |   3 +-
 9 files changed, 406 insertions(+), 210 deletions(-)

diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h
index 2d8ff2159e..c88357a396 100644
--- a/include/vkgcDefs.h
+++ b/include/vkgcDefs.h
@@ -241,6 +241,7 @@ struct optional_bool : private std::optional<bool> {
   using std::optional<bool>::has_value;
   using std::optional<bool>::value;
   using std::optional<bool>::value_or;
+  using std::optional<bool>::operator*;
 };
 
 /// Enumerates result codes of LLPC operations.
@@ -873,7 +874,7 @@ struct PipelineShaderOptions {
   unsigned ldsSpillLimitDwords;
 
   /// Attempt to scalarize waterfall descriptor loads.
-  bool scalarizeWaterfallLoads;
+  optional_bool scalarizeWaterfallLoads;
 
   /// Force rearranges threadId within group into blocks of 8*8 or 8*4
   bool overrideForceThreadIdSwizzling;
diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp
index 2a3197be06..428bddacc2 100644
--- a/lgc/builder/BuilderImpl.cpp
+++ b/lgc/builder/BuilderImpl.cpp
@@ -336,20 +336,18 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine &
 
 #if defined(LLVM_HAVE_BRANCH_AMD_GFX)
 // =====================================================================================================================
-// For a non-uniform input, try and trace back through a descriptor load to
-// find the non-uniform index used in it. If that fails, we just use the
-// operand value as the index.
-//
-// Note that this function may return null, which means that the given value has been shown to be uniform.
-//
-// This uses a fairly simple heuristic that nevertheless allows temporary expansion of the search breadth to handle
-// the common case where a base pointer is assembled from separate high and low halves.
-//
+
+// Support function for traceNonUniformIndex(). Get the load of the non-uniform operand of the non-uniform descriptor
+// and the chain of instructions that leads to it.
 // @param nonUniformVal : Value representing non-uniform descriptor
-// @return : Value representing the non-uniform index, or null if nonUniformVal could be proven to be uniform
-static Value *traceNonUniformIndex(Value *nonUniformVal) {
+// @param loadChain : Chain of instructions that gives us the load of the non-uniform operand
+// @return : non-uniform load, or nullptr if there is no non-uniform load
+static LoadInst *getNonUniformLoad(Value *nonUniformVal, DenseMap<Value *, SmallVector<Value *>> &loadChain) {
+
   auto load = dyn_cast<LoadInst>(nonUniformVal);
-  if (!load) {
+  if (load)
+    loadChain[load].push_back(load->getOperand(0));
+  else {
     // Workarounds that modify image descriptor can be peeped through, i.e.
     // %baseValue = load <8 x i32>, <8 x i32> addrspace(4)* %..., align 16
     // %rawElement = extractelement <8 x i32> %baseValue, i64 6
@@ -357,28 +355,53 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
     // %nonUniform = insertelement <8 x i32> %baseValue, i32 %updatedElement, i64 6
     auto insert = dyn_cast<InsertElementInst>(nonUniformVal);
     if (!insert)
-      return nonUniformVal;
+      return nullptr;
 
     load = dyn_cast<LoadInst>(insert->getOperand(0));
     if (!load)
-      return nonUniformVal;
+      return nullptr;
 
     // We found the load, but must verify the chain.
     // Consider updatedElement as a generic instruction or constant.
-    if (auto updatedElement = dyn_cast<Instruction>(insert->getOperand(1))) {
+    Value *insertOp1 = insert->getOperand(1);
+    if (auto updatedElement = dyn_cast<Instruction>(insertOp1)) {
+      loadChain[insert].push_back(insertOp1);
       for (Value *operand : updatedElement->operands()) {
         if (auto extract = dyn_cast<ExtractElementInst>(operand)) {
           // Only dynamic value must be ExtractElementInst based on load.
-          if (dyn_cast<LoadInst>(extract->getOperand(0)) != load)
-            return nonUniformVal;
+          loadChain[insert].push_back(extract);
+          if (dyn_cast<LoadInst>(extract->getOperand(0)) != load) {
+            loadChain.clear();
+            return nullptr;
+          }
         } else if (!isa<Constant>(operand)) {
-          return nonUniformVal;
+          loadChain.clear();
+          return nullptr;
         }
       }
-    } else if (!isa<Constant>(insert->getOperand(1))) {
-      return nonUniformVal;
+    } else if (!isa<Constant>(insertOp1)) {
+      loadChain.clear();
+      return nullptr;
     }
+    loadChain[insert].push_back(load);
+    loadChain[insert].push_back(load->getOperand(0));
   }
+  return load;
+}
+
+// For a non-uniform input, try and trace back through a descriptor load to
+// find the non-uniform index used in it. If that fails, we just use the
+// operand value as the index.
+//
+// Note that this function may return null, which means that the given value has been shown to be uniform.
+//
+// This uses a fairly simple heuristic that nevertheless allows temporary expansion of the search breadth to handle
+// the common case where a base pointer is assembled from separate high and low halves.
+//
+// @param nonUniformVal : Value representing non-uniform descriptor
+// @param load : Load operand of the non-uniform descriptor
+// @return : Value representing the non-uniform index, or nullptr if nonUniformVal could be proven to be uniform
+static Value *traceNonUniformIndex(Value *nonUniformVal, Instruction *load) {
 
   auto getSize = [](Value *value) -> uint64_t {
     uint64_t size = value->getType()->getPrimitiveSizeInBits().getFixedValue();
@@ -503,36 +526,151 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
   return candidateIndex;
 }
+#endif // Guard for amd gfx branch.
 
-// =====================================================================================================================
-// Test whether two instructions are identical
-// or are the same operation on identical operands.
-// @param lhs : First instruction
-// @param rhs : Second instruction
-// @return Result of equally test
-static bool instructionsEqual(Instruction *lhs, Instruction *rhs) {
-  if (lhs->isIdenticalTo(rhs))
-    return true;
-
-  if (!lhs->isSameOperationAs(rhs))
-    return false;
-
-  for (unsigned idx = 0, end = lhs->getNumOperands(); idx != end; ++idx) {
-    Value *lhsVal = lhs->getOperand(idx);
-    Value *rhsVal = rhs->getOperand(idx);
-    if (lhsVal == rhsVal)
-      continue;
-    Instruction *lhsInst = dyn_cast<Instruction>(lhsVal);
-    Instruction *rhsInst = dyn_cast<Instruction>(rhsVal);
-    if (!lhsInst || !rhsInst)
-      return false;
-    if (!lhsInst->isIdenticalTo(rhsInst))
-      return false;
+// Emit @llvm.amdgcn.waterfall.begin and @llvm.amdgcn.waterfall.readfirstlane intrinsics for non-uniform descriptors
+// which do not need scalarization.
+//
+// @param nonUniformInst : The instruction to put in a waterfall loop
+// @param operandIdxs : The operand index/indices for non-uniform inputs that need to be uniform
+// @param nonUniformIndices : Non-uniform operands of the non-uniform instruction
+// @param useVgprForOperands : Non-uniform inputs should be put in VGPRs
+// @param instName : Name to give instruction(s)
+// @return : llvm.amdgcn.waterfall.begin intrinsic
+Value *BuilderImpl::emitWaterfallBegin(Instruction *nonUniformInst, ArrayRef<unsigned> operandIdxs,
+                                       ArrayRef<Value *> nonUniformIndices, bool useVgprForOperands,
+                                       const Twine &instName) {
+  Value *waterfallBegin = nullptr;
+
+  // Insert new code just before nonUniformInst.
+  SetInsertPoint(nonUniformInst);
+
+  // The first begin contains a null token for the previous token argument
+  waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
+  for (auto nonUniformIndex : nonUniformIndices)
+    // Start the waterfall loop using the waterfall index.
+    waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, nonUniformIndex->getType(),
+                                     {waterfallBegin, nonUniformIndex}, nullptr, instName);
+
+  // Scalarize each non-uniform operand of the instruction.
+  for (unsigned operandIdx : operandIdxs) {
+    Value *desc = nonUniformInst->getOperand(operandIdx);
+    auto descTy = desc->getType();
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892
+    // Old version of the code
+#else
+    // When the non-uniform use is in a VGPR, we can save a v_mov by not inserting the amdgcn_waterfall_readfirstlane
+    if (!useVgprForOperands)
+#endif
+      desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, {waterfallBegin, desc}, nullptr,
+                             instName);
+    if (nonUniformInst->getType()->isVoidTy()) {
+      // The buffer/image operation we are waterfalling is a store with no return value. Use
+      // llvm.amdgcn.waterfall.last.use on the descriptor.
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892
+      // Old version of the code
+      desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, descTy, {waterfallBegin, desc}, nullptr, instName);
+#else
+      desc = CreateIntrinsic(useVgprForOperands ? Intrinsic::amdgcn_waterfall_last_use_vgpr
+                                                : Intrinsic::amdgcn_waterfall_last_use,
+                             descTy, {waterfallBegin, desc}, nullptr, instName);
+#endif
+    }
+    // Replace the descriptor operand in the buffer/image operation.
+    nonUniformInst->setOperand(operandIdx, desc);
   }
+  return waterfallBegin;
+}
+
+// Emit @llvm.amdgcn.waterfall.begin and @llvm.amdgcn.waterfall.readfirstlane intrinsics for scalarized non-uniform
+// descriptors.
+//
+// @param nonUniformInst : The instruction to put in a waterfall loop
+// @param firstIndexInst : First shared index instruction
+// @param operandIdxs : The operand index/indices for non-uniform inputs that need to be uniform
+// @param nonUniformIndices : Non-uniform operands of the non-uniform instruction
+// @param loadChain : Non-uniform loads
+// @param useVgprForOperands : Non-uniform inputs should be put in VGPRs
+// @param instName : Name to give instruction(s)
+// @return : llvm.amdgcn.waterfall.begin intrinsic
+Value *BuilderImpl::emitWaterfallBeginForScalarizedLoops(Instruction *nonUniformInst, Instruction *firstIndexInst,
+                                                         ArrayRef<unsigned> operandIdxs,
+                                                         ArrayRef<Value *> nonUniformIndices,
+                                                         DenseMap<Value *, SmallVector<Value *>> loadChain,
+                                                         bool useVgprForOperands, const Twine &instName) {
+
+  assert(firstIndexInst);
+  assert(nonUniformInst);
+
+  // Bail out if we cannot handle any of the operands of nonUniformInst.
+  if (loadChain.empty())
+    return emitWaterfallBegin(nonUniformInst, operandIdxs, nonUniformIndices, useVgprForOperands, instName);
+
+  for (unsigned operandIdx : operandIdxs) {
+    Value *desc = nonUniformInst->getOperand(operandIdx);
+    if (!loadChain.contains(desc))
+      return emitWaterfallBegin(nonUniformInst, operandIdxs, nonUniformIndices, useVgprForOperands, instName);
+  }
+
+  auto descTy = firstIndexInst->getType();
+  SetInsertPoint(nonUniformInst);
+  Instruction *waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy,
+                                                {ConstantInt::get(getInt32Ty(), 0), firstIndexInst}, nullptr, instName);
+  Instruction *readFirstLane = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy},
+                                               {waterfallBegin, firstIndexInst}, nullptr, instName);
+
+  // Get the instructions that should be pushed in the loop from loadChain and copy them inside the waterfall loop.
+  for (unsigned operandIdx : operandIdxs) {
+    Value *desc = nonUniformInst->getOperand(operandIdx);
+    if (isa<LoadInst>(desc)) {
+      auto *origLoad = cast<LoadInst>(desc);
+      auto *newLoad = origLoad->clone();
+      auto *origGep = dyn_cast<GetElementPtrInst>(origLoad->getOperand(0));
+      auto *newGep = origGep->clone();
+      newLoad->insertBefore(nonUniformInst);
+      nonUniformInst->setOperand(operandIdx, newLoad);
+      newGep->insertBefore(newLoad);
+      newGep->setOperand(1, readFirstLane);
+      newLoad->setOperand(0, newGep);
+    } else if (isa<InsertElementInst>(desc)) {
+      // This case is for gfx 10.3.2.
+      // Clone the instructions of loadChain.
+      auto *origInsert = cast<InsertElementInst>(desc);
+      auto *newInsert = origInsert->clone();
+      newInsert->insertBefore(nonUniformInst);
+      nonUniformInst->setOperand(operandIdx, newInsert);
+      std::map<Instruction *, Instruction *> origClonedValuesMap;
+      origClonedValuesMap[origInsert] = newInsert;
+      auto &valuesToClone = loadChain[origInsert];
+      Instruction *prevInst = newInsert;
+      for (auto *origVal : valuesToClone) {
+        auto *origInst = cast<Instruction>(origVal);
+        auto *newInst = origInst->clone();
+        newInst->insertBefore(prevInst);
+        origClonedValuesMap[origInst] = newInst;
+        prevInst = newInst;
+      }
-  return true;
+      // Update the operands of the cloned instructions.
+      for (auto [origInst, newInst] : origClonedValuesMap) {
+        for (Use &use : newInst->operands()) {
+          Value *op = use.get();
+          if (auto *opI = dyn_cast<Instruction>(op)) {
+            auto it = origClonedValuesMap.find(opI);
+            if (it == origClonedValuesMap.end())
+              continue;
+            Instruction *clonedI = it->second;
+            use.set(clonedI);
+          }
+        }
+
+        if (isa<GetElementPtrInst>(newInst))
+          newInst->setOperand(1, readFirstLane);
+      }
+    }
+  }
+  return waterfallBegin;
 }
-#endif
 
 // =====================================================================================================================
 // Create a waterfall loop containing the specified instruction.
@@ -554,14 +692,21 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
   assert(operandIdxs.empty() == false);
 
   SmallVector<Value *> nonUniformIndices;
+  DenseMap<Value *, SmallVector<Value *>> loadChain;
   for (unsigned operandIdx : operandIdxs) {
-    Value *nonUniformIndex = traceNonUniformIndex(nonUniformInst->getOperand(operandIdx));
+    Value *nonUniformVal = nonUniformInst->getOperand(operandIdx);
+    LoadInst *load = getNonUniformLoad(nonUniformVal, loadChain);
+    Value *nonUniformIndex = load ? traceNonUniformIndex(nonUniformVal, load) : nonUniformVal;
     if (nonUniformIndex)
       nonUniformIndices.push_back(nonUniformIndex);
   }
+
   if (nonUniformIndices.empty())
     return nonUniformInst;
 
+  if (nonUniformInst->getType()->isVoidTy())
+    scalarizeDescriptorLoads = false;
+
   // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the
   // waterfall loop.
   for (Value *&nonUniformVal : nonUniformIndices) {
@@ -579,105 +724,27 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
   // Find first index instruction and check if index instructions are identical.
   Instruction *firstIndexInst = nullptr;
   if (scalarizeDescriptorLoads) {
-    // FIXME: these do not actually need to be identical if we introduce multiple waterfall
-    // begin and readlane intrinsics for these.
-    bool identicalIndexes = true;
     for (Value *nonUniformVal : nonUniformIndices) {
       Instruction *nuInst = dyn_cast<Instruction>(nonUniformVal);
-      // Note: parent check here guards use of comesBefore below
-      if (!nuInst || (firstIndexInst && !instructionsEqual(nuInst, firstIndexInst)) ||
-          (firstIndexInst && nuInst->getParent() != firstIndexInst->getParent())) {
-        identicalIndexes = false;
-        break;
-      }
       if (!firstIndexInst || nuInst->comesBefore(firstIndexInst))
        firstIndexInst = nuInst;
    }

-    // Ensure we do not create a waterfall across blocks.
     // FIXME: we could use dominator check to allow scalarizing descriptor loads on multi-block spans;
     // however, this also requires backend support for multi-block waterfalls to be implemented.
-    if (!identicalIndexes || !firstIndexInst ||
-        (firstIndexInst && firstIndexInst->getParent() != nonUniformInst->getParent()))
+    if (!firstIndexInst || (firstIndexInst && firstIndexInst->getParent() != nonUniformInst->getParent()))
       scalarizeDescriptorLoads = false;
   }
 
   // Save Builder's insert point
   IRBuilder<>::InsertPointGuard guard(*this);
-  Value *waterfallBegin;
-  if (scalarizeDescriptorLoads) {
-    // Attempt to scalarize descriptor loads.
-    assert(firstIndexInst);
-    CallInst *firstCallInst = dyn_cast<CallInst>(firstIndexInst);
-    if (firstCallInst && firstCallInst->getIntrinsicID() == Intrinsic::amdgcn_waterfall_readfirstlane) {
-      // Descriptor loads are already inside a waterfall.
-      waterfallBegin = firstCallInst->getArgOperand(0);
-    } else {
-      // Begin waterfall loop just after shared index is computed.
-      // This places all dependent instructions within the waterfall loop, including descriptor loads.
-      auto descTy = firstIndexInst->getType();
-      SetInsertPoint(firstIndexInst->getNextNonDebugInstruction(false));
-      waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
-      waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst},
-                                       nullptr, instName);
-
-      // Scalarize shared index.
-      Value *desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy},
-                                    {waterfallBegin, firstIndexInst}, nullptr, instName);
-
-      // Replace all references to shared index within the waterfall loop with scalarized index.
-      // (Note: this includes the non-uniform instruction itself.)
-      // Loads using scalarized index will become scalar loads.
-      for (Value *otherNonUniformVal : nonUniformIndices) {
-        otherNonUniformVal->replaceUsesWithIf(desc, [desc, waterfallBegin, nonUniformInst](Use &U) {
-          Instruction *userInst = cast<Instruction>(U.getUser());
-          return U.getUser() != waterfallBegin && U.getUser() != desc &&
-                 userInst->getParent() == nonUniformInst->getParent() &&
-                 (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst));
-        });
-      }
-    }
-  } else {
-    // Insert new code just before nonUniformInst.
-    SetInsertPoint(nonUniformInst);
-
-    // The first begin contains a null token for the previous token argument
-    waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
-    for (auto nonUniformVal : nonUniformIndices) {
-      // Start the waterfall loop using the waterfall index.
-      waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, nonUniformVal->getType(),
-                                       {waterfallBegin, nonUniformVal}, nullptr, instName);
-    }
-
-    // Scalarize each non-uniform operand of the instruction.
-    for (unsigned operandIdx : operandIdxs) {
-      Value *desc = nonUniformInst->getOperand(operandIdx);
-      auto descTy = desc->getType();
-#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892
-      // Old version of the code
-#else
-      // When the non-uniform use is in a VGPR, we can save a v_mov by not inserting the amdgcn_waterfall_readfirstlane
-      if (!useVgprForOperands)
-#endif
-        desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, {waterfallBegin, desc},
-                               nullptr, instName);
-      if (nonUniformInst->getType()->isVoidTy()) {
-        // The buffer/image operation we are waterfalling is a store with no return value. Use
-        // llvm.amdgcn.waterfall.last.use on the descriptor.
-#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892
-        // Old version of the code
-        desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, descTy, {waterfallBegin, desc}, nullptr, instName);
-#else
-        desc = CreateIntrinsic(useVgprForOperands ? Intrinsic::amdgcn_waterfall_last_use_vgpr
-                                                  : Intrinsic::amdgcn_waterfall_last_use,
-                               descTy, {waterfallBegin, desc}, nullptr, instName);
-#endif
-      }
-      // Replace the descriptor operand in the buffer/image operation.
-      nonUniformInst->setOperand(operandIdx, desc);
-    }
-  }
+  Value *waterfallBegin = nullptr;
+  if (scalarizeDescriptorLoads)
+    waterfallBegin = emitWaterfallBeginForScalarizedLoops(nonUniformInst, firstIndexInst, operandIdxs,
+                                                          nonUniformIndices, loadChain, useVgprForOperands, instName);
+  else
+    waterfallBegin = emitWaterfallBegin(nonUniformInst, operandIdxs, nonUniformIndices, useVgprForOperands, instName);
 
   Instruction *resultValue = nonUniformInst;
 
diff --git a/lgc/include/lgc/builder/BuilderImpl.h b/lgc/include/lgc/builder/BuilderImpl.h
index c758bb77ec..eb423ec033 100644
--- a/lgc/include/lgc/builder/BuilderImpl.h
+++ b/lgc/include/lgc/builder/BuilderImpl.h
@@ -132,6 +132,17 @@ class BuilderImpl : public BuilderDefs {
 
   LgcContext *m_builderContext; // Builder context
 
+  llvm::Value *emitWaterfallBegin(llvm::Instruction *nonUniformInst, llvm::ArrayRef<unsigned> operandIdxs,
+                                  llvm::ArrayRef<llvm::Value *> nonUniformIndices, bool useVgprForOperands = false,
+                                  const llvm::Twine &instName = "");
+
+  llvm::Value *
+  emitWaterfallBeginForScalarizedLoops(llvm::Instruction *nonUniformInst, llvm::Instruction *firstIndexInst,
+                                       llvm::ArrayRef<unsigned> operandIdxs,
+                                       llvm::ArrayRef<llvm::Value *> nonUniformIndices,
+                                       llvm::DenseMap<llvm::Value *, llvm::SmallVector<llvm::Value *>> loadChain,
+                                       bool useVgprForOperands = false, const llvm::Twine &instName = "");
+
   // -------------------------------------------------------------------------------------------------------------------
   // Arithmetic operations
 public:
diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp
index cf1105a45c..c21589412d 100644
--- a/llpc/context/llpcPipelineContext.cpp
+++ b/llpc/context/llpcPipelineContext.cpp
@@ -612,13 +612,12 @@ ShaderOptions PipelineContext::computeShaderOptions(const PipelineShaderInfo &sh
     }
   }
 
-  if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0) {
+  if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0)
     shaderOptions.scalarizeWaterfallLoads = ScalarizeWaterfallDescriptorLoads;
-  } else {
-    shaderOptions.scalarizeWaterfallLoads = shaderInfo.options.scalarizeWaterfallLoads;
-    // Enable waterfall load scalarization when vgpr limit is set.
- if (shaderOptions.vgprLimit != 0 && shaderOptions.vgprLimit != UINT_MAX) - shaderOptions.scalarizeWaterfallLoads = true; + else { + shaderOptions.scalarizeWaterfallLoads = true; + if (shaderInfo.options.scalarizeWaterfallLoads.has_value()) + shaderOptions.scalarizeWaterfallLoads = *shaderInfo.options.scalarizeWaterfallLoads; } shaderOptions.sgprLimit = shaderInfo.options.sgprLimit; diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag index 6845f3f011..cae39ae86b 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag @@ -18,16 +18,18 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], vec2(0.0)); } -// BEGIN_SHADERTEST -/* -; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc -; Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc -; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -; SHADERTEST-DAG: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -; SHADERTEST: AMDLLPC SUCCESS -*/ -// END_SHADERTEST +// RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc +// Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc +// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag index fbf9c25c0f..fe0307d622 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag @@ -1,6 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset 
- #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -16,18 +13,37 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], _6); } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS -// -// END_SHADERTEST +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=GFX %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=GFX_10_3_2 %s + +// GFX-LABEL: {{^// LLPC}} pipeline patching results +// GFX: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// GFX-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// GFX-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// GFX: AMDLLPC SUCCESS + +// GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_2: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_2-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// GFX_10_3_2-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// GFX_10_3_2-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = 
getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_2-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag index 82cd87a930..0355788085 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag @@ -1,5 +1,5 @@ // Make sure that there are two non-overlapping waterfall loops -// First is scalarized and second is vector type +// The first two loops are scalarized and the last one is vector type #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -25,24 +25,87 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -// SHADERTEST: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=GFX %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=GFX_10_3_2 %s + +// GFX-LABEL: {{^// LLPC}} pipeline patching results +// GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane1:[0-9]+]] = 
call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// GFX-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// GFX-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load5]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) + +// GFX: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[load2]]) +// GFX-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// GFX-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// GFX: AMDLLPC SUCCESS + +// GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 
@llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and1]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load3]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) // -// END_SHADERTEST +// GFX_10_3_2-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX_10_3_2-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX_10_3_2-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 6 +// GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[and1]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load5]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX_10_3_2-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) + +// GFX_10_3_2: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load2]], i64 6 +// GFX_10_3_2: %[[and2:[0-9]+]] = and i32 %[[extract2]], -1048577 +// GFX_10_3_2-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load2]], i32 %[[and2]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load2]], <8 x i32> +// GFX_10_3_2: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[shufflevector2]]) +// GFX_10_3_2-NEXT: 
%[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// GFX_10_3_2-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag index 123a2bc917..c8cce2be79 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag @@ -1,7 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset -// Make sure that there are two waterfall.end operations for the samples - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -20,21 +16,61 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=GFX %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=GFX_10_3_2 %s + +// GFX-LABEL: {{^// LLPC}} pipeline patching results +// GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> 
@llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// GFX: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// GFX: AMDLLPC SUCCESS + +// GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) // -// END_SHADERTEST +// GFX_10_3_2: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// 
GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load3]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// GFX_10_3_2: AMDLLPC SUCCESS diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index 76686cf557..b799441cc2 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -647,7 +647,8 @@ void PipelineDumper::dumpPipelineShaderInfo(const PipelineShaderInfo *shaderInfo dumpFile << "options.fastMathFlags = " << shaderInfo->options.fastMathFlags << "\n"; dumpFile << "options.disableFastMathFlags = " << shaderInfo->options.disableFastMathFlags << "\n"; dumpFile << "options.ldsSpillLimitDwords = " << shaderInfo->options.ldsSpillLimitDwords << "\n"; - dumpFile << "options.scalarizeWaterfallLoads = " << shaderInfo->options.scalarizeWaterfallLoads << "\n"; + if (shaderInfo->options.scalarizeWaterfallLoads.has_value()) + dumpFile << "options.scalarizeWaterfallLoads = " << *shaderInfo->options.scalarizeWaterfallLoads << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeX = " << shaderInfo->options.overrideShaderThreadGroupSizeX << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeY = " << shaderInfo->options.overrideShaderThreadGroupSizeY << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeZ = " << shaderInfo->options.overrideShaderThreadGroupSizeZ << "\n";
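
The option-resolution change in llpcPipelineContext.cpp above amounts to a simple precedence rule: an explicit -scalarize-waterfall-descriptor-loads command-line setting wins, otherwise a per-shader scalarizeWaterfallLoads value wins, and otherwise scalarization is now enabled by default. Below is a minimal standalone sketch of that rule for illustration only; the helper name and parameters are hypothetical and this is not LLPC code.

#include <optional>

// cmdLineFlag models ScalarizeWaterfallDescriptorLoads (set only when the option appears on the command line);
// perShaderOption models the new optional_bool PipelineShaderOptions::scalarizeWaterfallLoads.
static bool resolveScalarizeWaterfallLoads(std::optional<bool> cmdLineFlag, std::optional<bool> perShaderOption) {
  if (cmdLineFlag.has_value())
    return *cmdLineFlag;        // an explicit command-line option takes precedence
  if (perShaderOption.has_value())
    return *perShaderOption;    // otherwise honor the per-shader pipeline option
  return true;                  // otherwise waterfall load scalarization defaults to on
}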