From 7282d68314e9d3f9d32b2490ef12f9ceb0c6adc9 Mon Sep 17 00:00:00 2001
From: cdevadas
Date: Tue, 23 Jul 2019 18:19:36 +0530
Subject: [PATCH] Move the skip branch insertion required for correctness
 upfront into control flow lowering

The skip branches over a block were inserted late, in SIInsertSkips, even
though some of them are required for correctness (see the FIXME removed
from SIInsertSkips::shouldSkip). Insert those branches upfront, during
control flow lowering: emitIf and emitElse now call skipMaskBranch, which
emits an S_CBRANCH_EXECZ over the region whenever shouldSkip finds an
instruction that must not execute, or is expensive, when EXEC = 0, and
they fall back to the SI_MASK_BRANCH pseudo otherwise. The
opcodeEmitsNoInsts helper moves from a static function in SIInsertSkips
into SIInstrInfo so that both passes can use it.
---
 llvm/lib/Target/AMDGPU/SIInsertSkips.cpp      | 33 +---------
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 13 ++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  3 +
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 65 +++++++++++++++++--
 4 files changed, 77 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 87e63fcc4a04fa..ca318a9ebb6a32 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -92,19 +92,6 @@ INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
 
 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
 
-static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
-  if (MI.isMetaInstruction())
-    return true;
-
-  // Handle target specific opcodes.
-  switch (MI.getOpcode()) {
-  case AMDGPU::SI_MASK_BRANCH:
-    return true;
-  default:
-    return false;
-  }
-}
-
 bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                                const MachineBasicBlock &To) const {
   unsigned NumInstr = 0;
@@ -116,27 +103,9 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
 
     for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
          NumInstr < SkipThreshold && I != E; ++I) {
-      if (opcodeEmitsNoInsts(*I))
+      if (TII->opcodeEmitsNoInsts(*I))
         continue;
 
-      // FIXME: Since this is required for correctness, this should be inserted
-      // during SILowerControlFlow.
-
-      // When a uniform loop is inside non-uniform control flow, the branch
-      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
-      // when EXEC = 0. We should skip the loop lest it becomes infinite.
-      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
-          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
-        return true;
-
-      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
-        return true;
-
-      // These instructions are potentially expensive even if EXEC = 0.
-      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
-          I->getOpcode() == AMDGPU::S_WAITCNT)
-        return true;
-
       ++NumInstr;
       if (NumInstr >= SkipThreshold)
         return true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1fce2dbe77433a..7403e5c63d649c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2677,6 +2677,19 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
          Opcode == AMDGPU::DS_GWS_BARRIER;
 }
 
+bool SIInstrInfo::opcodeEmitsNoInsts(const MachineInstr &MI) const {
+  if (MI.isMetaInstruction())
+    return true;
+
+  // Handle target specific opcodes.
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_MASK_BRANCH:
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
   unsigned Opcode = MI.getOpcode();
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 3ff35da0b96307..f4b6454cd11ae1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -668,6 +668,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
       return MO.isReg() && RI.isVGPR(MRI, MO.getReg());});
   }
 
+  /// Returns true if \p MI won't emit an instruction in the end.
+  bool opcodeEmitsNoInsts(const MachineInstr &MI) const;
+
   /// Whether we must prevent this instruction from executing with EXEC = 0.
   bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 78f409cd955520..0d2a17fac53cf2 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -102,6 +102,10 @@ class SILowerControlFlow : public MachineFunctionPass {
                         SmallVectorImpl<MachineOperand> &Src) const;
 
   void combineMasks(MachineInstr &MI);
+  MachineInstr *skipMaskBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
+
+  bool shouldSkip(const MachineBasicBlock &From,
+                  const MachineBasicBlock &To) const;
 
 public:
   static char ID;
@@ -230,8 +234,10 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
 
   // Insert a pseudo terminator to help keep the verifier happy. This will also
   // be used later when inserting skips.
-  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-                            .add(MI.getOperand(2));
+  MachineInstr *NewBr = skipMaskBranch(MI, *MI.getParent());
+  if (!NewBr)
+    NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+                .add(MI.getOperand(2));
 
   if (!LIS) {
     MI.eraseFromParent();
@@ -307,9 +313,11 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
           .addReg(Exec)
           .addReg(DstReg);
 
-  MachineInstr *Branch =
-      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-          .addMBB(DestBB);
+  // Insert the skip branch if required.
+  MachineInstr *Branch = skipMaskBranch(MI, *MI.getParent());
+  if (!Branch)
+    Branch = BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+                 .addMBB(DestBB);
 
   if (!LIS) {
     MI.eraseFromParent();
@@ -473,6 +481,53 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
     MRI->getUniqueVRegDef(Reg)->eraseFromParent();
 }
 
+bool SILowerControlFlow::shouldSkip(const MachineBasicBlock &From,
+                                    const MachineBasicBlock &To) const {
+  const MachineFunction *MF = From.getParent();
+
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      if (TII->opcodeEmitsNoInsts(*I))
+        continue;
+
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+      // when EXEC = 0. We should skip the loop lest it becomes infinite.
+      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+        return true;
+
+      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
+        return true;
+
+      // These instructions are potentially expensive even if EXEC = 0.
+      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+          I->getOpcode() == AMDGPU::S_WAITCNT)
+        return true;
+    }
+  }
+  return false;
+}
+
+// Returns the inserted skip branch if the block must be skipped, else nullptr.
+MachineInstr *SILowerControlFlow::skipMaskBranch(MachineInstr &MI,
+                                                 MachineBasicBlock &SrcMBB) {
+  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
+
+  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
+    return nullptr;
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
+
+  return BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+      .addMBB(DestBB);
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();