diff --git a/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp b/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp index a27cd4ad41..72dc74db09 100644 --- a/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp +++ b/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp @@ -32,6 +32,7 @@ #include "SPIRVInternal.h" #include "llpcContext.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" @@ -51,29 +52,21 @@ namespace Llpc { // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation PreservedAnalyses SpirvLowerConstImmediateStore::run(Module &module, ModuleAnalysisManager &analysisManager) { - runImpl(module); - return PreservedAnalyses::none(); -} - -// ===================================================================================================================== -// Executes this SPIR-V lowering pass on the specified LLVM module. -// -// @param [in/out] module : LLVM module to be run on -bool SpirvLowerConstImmediateStore::runImpl(Module &module) { LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Const-Immediate-Store\n"); SpirvLower::init(&module); // Process "alloca" instructions to see if they can be optimized to a read-only global // variable. - for (auto funcIt = module.begin(), funcItEnd = module.end(); funcIt != funcItEnd; ++funcIt) { - if (auto func = dyn_cast(&*funcIt)) { - if (!func->empty()) - processAllocaInsts(func); + bool changed = false; + for (auto &func : module.functions()) { + if (!func.empty()) { + if (processAllocaInsts(&func)) + changed |= true; } } - return true; + return changed ? PreservedAnalyses::allInSet() : PreservedAnalyses::all(); } // ===================================================================================================================== @@ -81,134 +74,158 @@ bool SpirvLowerConstImmediateStore::runImpl(Module &module) { // can be optimized to a read-only global variable. // // @param func : Function to process -void SpirvLowerConstImmediateStore::processAllocaInsts(Function *func) { +bool SpirvLowerConstImmediateStore::processAllocaInsts(Function *func) { // NOTE: We only visit the entry block on the basis that SPIR-V translator puts all "alloca" // instructions there. - auto entryBlock = &func->front(); - for (auto instIt = entryBlock->begin(), instItEnd = entryBlock->end(); instIt != instItEnd; ++instIt) { - auto inst = &*instIt; - if (auto allocaInst = dyn_cast(inst)) { - if (allocaInst->getAllocatedType()->isAggregateType()) { - // Got an "alloca" instruction of aggregate type. - auto storeInst = findSingleStore(allocaInst); - if (storeInst && isa(storeInst->getValueOperand())) { - // Got an aggregate "alloca" with a single store to the whole type. - // Do the optimization. - convertAllocaToReadOnlyGlobal(storeInst); - } - } + bool changed = false; + SmallVector candidates; + for (auto &inst : func->getEntryBlock()) { + if (auto allocaInst = dyn_cast(&inst)) { + if (allocaInst->getAllocatedType()->isAggregateType()) + candidates.push_back(allocaInst); } } + for (auto *alloca : candidates) { + if (tryProcessAlloca(alloca)) + changed |= true; + } + return changed; } // ===================================================================================================================== -// Finds the single "store" instruction storing to this pointer. -// -// Returns nullptr if no "store", multiple "store", or partial "store" instructions (store only part -// of the memory) are found. -// -// NOTE: This is conservative in that it returns nullptr if the pointer escapes by being used in anything -// other than "store" (as the pointer), "load" or "getelementptr" instruction. +// For a single alloca, try to replace it by a constant global variable. // // @param allocaInst : The "alloca" instruction to process -StoreInst *SpirvLowerConstImmediateStore::findSingleStore(AllocaInst *allocaInst) { - std::vector pointers; - bool isOuterPointer = true; - StoreInst *storeInstFound = nullptr; - Instruction *pointer = allocaInst; - while (true) { - for (auto useIt = pointer->use_begin(), useItEnd = pointer->use_end(); useIt != useItEnd; ++useIt) { - auto user = cast(useIt->getUser()); +// @return true if the alloca was replaced +bool SpirvLowerConstImmediateStore::tryProcessAlloca(AllocaInst *allocaInst) { + auto *allocatedTy = allocaInst->getAllocatedType(); + auto *arrayTy = dyn_cast(allocatedTy); + + StoreInst *aggregateStore = nullptr; + DenseMap elementStores; + + SmallVector toErase; + SmallVector geps; + + // Step 1: Determine if the alloca can be converted and find the relevant store(s) + SmallVector>> pointers; + pointers.emplace_back(allocaInst, 0); + do { + auto [pointer, index] = pointers.pop_back_val(); + for (Use &use : pointer->uses()) { + auto user = cast(use.getUser()); if (auto storeInst = dyn_cast(user)) { - if (pointer == storeInst->getValueOperand() || storeInstFound || !isOuterPointer) { - // Pointer escapes by being stored, or we have already found a "store" - // instruction, or this is a partial "store" instruction. - return nullptr; + if (use.getOperandNo() != storeInst->getPointerOperandIndex() || !index.has_value()) { + // Pointer escapes by being stored, or we store to a dynamically indexed (or otherwise complex) pointer. + return false; } - storeInstFound = storeInst; - } else if (auto getElemPtrInst = dyn_cast(user)) - pointers.push_back(getElemPtrInst); - else if (!isa(user) && !isAssumeLikeIntrinsic(user)) { - // Pointer escapes by being used in some way other than "load/store/getelementptr". - return nullptr; - } - } - if (pointers.empty()) - break; + Value *storeValue = storeInst->getValueOperand(); + if (!isa(storeValue)) + return false; - pointer = pointers.back(); - pointers.pop_back(); - isOuterPointer = false; - } + if (storeValue->getType() == allocatedTy) { + if (aggregateStore || !elementStores.empty() || index.value() != 0) + return false; + aggregateStore = storeInst; + continue; + } - return storeInstFound; -} + if (arrayTy && storeValue->getType() == arrayTy->getArrayElementType()) { + if (aggregateStore) + return false; + if (index.value() >= arrayTy->getArrayNumElements()) + return false; -// ===================================================================================================================== -// Converts an "alloca" instruction with a single constant store into a read-only global variable. -// -// NOTE: This erases the "store" instruction (so it will not be lowered by a later lowering pass -// any more) but not the "alloca" or replaced "getelementptr" instruction (they will be removed -// later by DCE pass). -// -// @param storeInst : The single constant store into the "alloca" -void SpirvLowerConstImmediateStore::convertAllocaToReadOnlyGlobal(StoreInst *storeInst) { - auto allocaInst = cast(storeInst->getPointerOperand()); - auto globalType = allocaInst->getAllocatedType(); - auto initVal = cast(storeInst->getValueOperand()); + if (!elementStores.try_emplace(index.value(), storeInst).second) + return false; + continue; + } - if (globalType != initVal->getType()) - return; + return false; + } - auto global = new GlobalVariable(*m_module, globalType, - true, // isConstant - GlobalValue::InternalLinkage, initVal, "", nullptr, GlobalValue::NotThreadLocal, - SPIRAS_Constant); - global->takeName(allocaInst); - // Change all uses of allocaInst to use global. We need to do it manually, as there is a change - // of address space, and we also need to recreate "getelementptr"s. - std::vector> allocaToGlobalMap; - allocaToGlobalMap.push_back(std::pair(allocaInst, global)); - do { - auto allocaInst = allocaToGlobalMap.back().first; - auto global = allocaToGlobalMap.back().second; - allocaToGlobalMap.pop_back(); - while (!allocaInst->use_empty()) { - auto useIt = allocaInst->use_begin(); - if (auto origGetElemPtrInst = dyn_cast(useIt->getUser())) { - // This use is a "getelementptr" instruction. Create a replacement one with the new address space. - SmallVector indices; - for (auto idxIt = origGetElemPtrInst->idx_begin(), idxItEnd = origGetElemPtrInst->idx_end(); idxIt != idxItEnd; - ++idxIt) - indices.push_back(*idxIt); - auto newGetElemPtrInst = GetElementPtrInst::Create(globalType, global, indices, "", origGetElemPtrInst); - newGetElemPtrInst->takeName(origGetElemPtrInst); - newGetElemPtrInst->setIsInBounds(origGetElemPtrInst->isInBounds()); - newGetElemPtrInst->copyMetadata(*origGetElemPtrInst); - // Remember that we need to replace the uses of the original "getelementptr" with the new one. - allocaToGlobalMap.push_back(std::pair(origGetElemPtrInst, newGetElemPtrInst)); - // Remove the use from the original "getelementptr". - *useIt = PoisonValue::get(allocaInst->getType()); + if (auto gep = dyn_cast(user)) { + geps.push_back(gep); + + if (index.has_value() && arrayTy && gep->getSourceElementType() == allocatedTy && + gep->hasAllConstantIndices() && gep->getNumIndices() == 2 && + cast(gep->getOperand(1))->isNullValue()) { + int64_t gepIdx = cast(gep->getOperand(2))->getSExtValue(); + if (gepIdx >= 0) { + pointers.emplace_back(gep, index.value() + gepIdx); + continue; + } + } + + pointers.emplace_back(gep, std::nullopt); continue; } - if (auto *intrinsic = dyn_cast(useIt->getUser())) { - switch (intrinsic->getIntrinsicID()) { - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - // Lifetime markers are only useful for allocas, not for globals, and if we did not erase them we would have - // to change their name mangling because of the change of address space. - intrinsic->eraseFromParent(); - continue; - } + if (isa(user)) + continue; + + if (isAssumeLikeIntrinsic(user)) { + toErase.push_back(user); + continue; } - *useIt = global; + // Pointer escapes by being used in some way other than "load/store/getelementptr". + return false; + } + } while (!pointers.empty()); + + // Step 2: Extract or build the initializer constant + Constant *initializer = nullptr; + + if (aggregateStore) { + initializer = cast(aggregateStore->getValueOperand()); + } else if (!elementStores.empty()) { + // Give up if the array is 4x larger than the number of element stores. This is a fairly arbitrary heuristic to + // prevent a super-linear blow-up of the size of IR. (Imagine input IR that defines a giant array and writes to + // only a single element of it.) + if (arrayTy->getArrayNumElements() / 4 > elementStores.size()) + return false; + + std::vector elements; + elements.resize(arrayTy->getArrayNumElements()); + + for (uint64_t index = 0; index < elements.size(); ++index) { + if (auto *store = elementStores.lookup(index)) + elements[index] = cast(store->getValueOperand()); + else + elements[index] = PoisonValue::get(arrayTy->getElementType()); } - // Visit next map pair. - } while (!allocaToGlobalMap.empty()); - storeInst->eraseFromParent(); + + initializer = ConstantArray::get(arrayTy, elements); + } else { + initializer = PoisonValue::get(allocatedTy); + } + + // Step 3: Create the global variable and replace the alloca + auto global = new GlobalVariable(*m_module, allocatedTy, + true, // isConstant + GlobalValue::InternalLinkage, initializer, "", nullptr, GlobalValue::NotThreadLocal, + SPIRAS_Constant); + global->takeName(allocaInst); + + for (Use &use : llvm::make_early_inc_range(allocaInst->uses())) + use.set(global); + + for (auto *gep : geps) + gep->mutateType(global->getType()); + + for (auto *inst : toErase) + inst->eraseFromParent(); + if (aggregateStore) + aggregateStore->eraseFromParent(); + for (auto [index, store] : elementStores) { + if (store) + store->eraseFromParent(); + } + allocaInst->eraseFromParent(); + + return true; } } // namespace Llpc diff --git a/llpc/lower/llpcSpirvLowerConstImmediateStore.h b/llpc/lower/llpcSpirvLowerConstImmediateStore.h index e65b45553d..db38ce5b38 100644 --- a/llpc/lower/llpcSpirvLowerConstImmediateStore.h +++ b/llpc/lower/llpcSpirvLowerConstImmediateStore.h @@ -45,14 +45,12 @@ namespace Llpc { class SpirvLowerConstImmediateStore : public SpirvLower, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); - bool runImpl(llvm::Module &module); static llvm::StringRef name() { return "Lower SPIR-V constant immediate store"; } private: - void processAllocaInsts(llvm::Function *func); - llvm::StoreInst *findSingleStore(llvm::AllocaInst *allocaInst); - void convertAllocaToReadOnlyGlobal(llvm::StoreInst *storeInst); + bool processAllocaInsts(llvm::Function *func); + bool tryProcessAlloca(llvm::AllocaInst *allocaInst); }; } // namespace Llpc diff --git a/llpc/test/shaderdb/general/TestCombineOfMultipleStoreInstructions.frag b/llpc/test/shaderdb/general/TestCombineOfMultipleStoreInstructions.frag new file mode 100644 index 0000000000..749a3d27a2 --- /dev/null +++ b/llpc/test/shaderdb/general/TestCombineOfMultipleStoreInstructions.frag @@ -0,0 +1,87 @@ +#version 460 + +// BEGIN_SHADERTEST +// RUN: amdllpc -verify-ir -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s + +// SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results +// SHADERTEST: @{{.*}} = internal unnamed_addr addrspace(4) constant [54 x float] [float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00] +// SHADERTEST: AMDLLPC SUCCESS +// END_SHADERTEST + +layout(push_constant, std430) uniform RootConstants +{ + uvec2 _m0; + uint _m1; + uint _m2; + uint _m3; + uint _m4; + uint _m5; + uint _m6; + uint _m7; +} registers; + +layout(location = 0) out vec4 SV_Target; +float _49[54]; + +void main() +{ + _49[0u] = 0.0; + _49[1u] = 0.0; + _49[2u] = 1.0; + _49[3u] = 0.0; + _49[4u] = 1.0; + _49[5u] = 0.0; + _49[6u] = -1.0; + _49[7u] = 0.0; + _49[8u] = 0.0; + _49[9u] = 0.0; + _49[10u] = 0.0; + _49[11u] = -1.0; + _49[12u] = 0.0; + _49[13u] = 1.0; + _49[14u] = 0.0; + _49[15u] = 1.0; + _49[16u] = 0.0; + _49[17u] = 0.0; + _49[18u] = -1.0; + _49[19u] = 0.0; + _49[20u] = 0.0; + _49[21u] = 0.0; + _49[22u] = 0.0; + _49[23u] = 1.0; + _49[24u] = 0.0; + _49[25u] = 1.0; + _49[26u] = 0.0; + _49[27u] = -1.0; + _49[28u] = 0.0; + _49[29u] = 0.0; + _49[30u] = 0.0; + _49[31u] = 0.0; + _49[32u] = -1.0; + _49[33u] = 0.0; + _49[34u] = -1.0; + _49[35u] = 0.0; + _49[36u] = 1.0; + _49[37u] = 0.0; + _49[38u] = 0.0; + _49[39u] = 0.0; + _49[40u] = 1.0; + _49[41u] = 0.0; + _49[42u] = 0.0; + _49[43u] = 0.0; + _49[44u] = 1.0; + _49[45u] = -1.0; + _49[46u] = 0.0; + _49[47u] = 0.0; + _49[48u] = 0.0; + _49[49u] = 1.0; + _49[50u] = 0.0; + _49[51u] = 0.0; + _49[52u] = 0.0; + _49[53u] = -1.0; + + uvec4 _160 = uvec4(registers._m1, registers._m2, registers._m3, registers._m4); + SV_Target.z = _49[2u + _160.z]; + SV_Target.w = _49[3u + _160.w]; +} +