From 7282d68314e9d3f9d32b2490ef12f9ceb0c6adc9 Mon Sep 17 00:00:00 2001
From: cdevadas
Date: Tue, 23 Jul 2019 18:19:36 +0530
Subject: [PATCH] Move the skip branch insertion required for correctness
 upfront into control flow lowering

The skip branches over a block were inserted late, in SIInsertSkips, even
though some of them are required for correctness (see the FIXME removed
from SIInsertSkips::shouldSkip). Insert those branches upfront, during
control flow lowering: emitIf and emitElse now call skipMaskBranch, which
emits an S_CBRANCH_EXECZ over the region whenever shouldSkip finds an
instruction that must not execute, or is expensive, when EXEC = 0, and
they fall back to the SI_MASK_BRANCH pseudo otherwise. The
opcodeEmitsNoInsts helper moves from a static function in SIInsertSkips
into SIInstrInfo so that both passes can use it.
---
 llvm/lib/Target/AMDGPU/SIInsertSkips.cpp      | 33 +---------
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 13 ++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  3 +
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 65 +++++++++++++++++--
 4 files changed, 77 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 87e63fcc4a04fa..ca318a9ebb6a32 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -92,19 +92,6 @@ INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
 
 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
 
-static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
-  if (MI.isMetaInstruction())
-    return true;
-
-  // Handle target specific opcodes.
-  switch (MI.getOpcode()) {
-  case AMDGPU::SI_MASK_BRANCH:
-    return true;
-  default:
-    return false;
-  }
-}
-
 bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                                const MachineBasicBlock &To) const {
   unsigned NumInstr = 0;
@@ -116,27 +103,9 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
 
     for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
          NumInstr < SkipThreshold && I != E; ++I) {
-      if (opcodeEmitsNoInsts(*I))
+      if (TII->opcodeEmitsNoInsts(*I))
         continue;
 
-      // FIXME: Since this is required for correctness, this should be inserted
-      // during SILowerControlFlow.
-
-      // When a uniform loop is inside non-uniform control flow, the branch
-      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
-      // when EXEC = 0. We should skip the loop lest it becomes infinite.
-      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
-          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
-        return true;
-
-      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
-        return true;
-
-      // These instructions are potentially expensive even if EXEC = 0.
-      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
-          I->getOpcode() == AMDGPU::S_WAITCNT)
-        return true;
-
       ++NumInstr;
       if (NumInstr >= SkipThreshold)
         return true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1fce2dbe77433a..7403e5c63d649c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2677,6 +2677,19 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
          Opcode == AMDGPU::DS_GWS_BARRIER;
 }
 
+bool SIInstrInfo::opcodeEmitsNoInsts(const MachineInstr &MI) const {
+  if (MI.isMetaInstruction())
+    return true;
+
+  // Handle target specific opcodes.
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_MASK_BRANCH:
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
   unsigned Opcode = MI.getOpcode();
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 3ff35da0b96307..f4b6454cd11ae1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -668,6 +668,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
       return MO.isReg() && RI.isVGPR(MRI, MO.getReg());});
   }
 
+  /// Returns true if \p MI won't emit an instruction in the end.
+  bool opcodeEmitsNoInsts(const MachineInstr &MI) const;
+
   /// Whether we must prevent this instruction from executing with EXEC = 0.
   bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 78f409cd955520..0d2a17fac53cf2 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -102,6 +102,10 @@ class SILowerControlFlow : public MachineFunctionPass {
                         SmallVectorImpl<MachineOperand> &Src) const;
 
   void combineMasks(MachineInstr &MI);
+  MachineInstr *skipMaskBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
+
+  bool shouldSkip(const MachineBasicBlock &From,
+                  const MachineBasicBlock &To) const;
 
 public:
   static char ID;
@@ -230,8 +234,10 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
 
   // Insert a pseudo terminator to help keep the verifier happy. This will also
   // be used later when inserting skips.
-  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-                            .add(MI.getOperand(2));
+  MachineInstr *NewBr = skipMaskBranch(MI, *MI.getParent());
+  if (!NewBr)
+    NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+                .add(MI.getOperand(2));
 
   if (!LIS) {
     MI.eraseFromParent();
@@ -307,9 +313,11 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
           .addReg(Exec)
           .addReg(DstReg);
 
-  MachineInstr *Branch =
-      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-          .addMBB(DestBB);
+  // Insert the skip branch if required.
+  MachineInstr *Branch = skipMaskBranch(MI, *MI.getParent());
+  if (!Branch)
+    Branch = BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+                 .addMBB(DestBB);
 
   if (!LIS) {
     MI.eraseFromParent();
@@ -473,6 +481,53 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
     MRI->getUniqueVRegDef(Reg)->eraseFromParent();
 }
 
+bool SILowerControlFlow::shouldSkip(const MachineBasicBlock &From,
+                                    const MachineBasicBlock &To) const {
+  const MachineFunction *MF = From.getParent();
+
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      if (TII->opcodeEmitsNoInsts(*I))
+        continue;
+
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+      // when EXEC = 0. We should skip the loop lest it becomes infinite.
+      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+        return true;
+
+      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
+        return true;
+
+      // These instructions are potentially expensive even if EXEC = 0.
+      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+          I->getOpcode() == AMDGPU::S_WAITCNT)
+        return true;
+    }
+  }
+  return false;
+}
+
+// Returns the inserted skip branch if the block must be skipped, else nullptr.
+MachineInstr *SILowerControlFlow::skipMaskBranch(MachineInstr &MI,
+                                                 MachineBasicBlock &SrcMBB) {
+  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
+
+  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
+    return nullptr;
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
+
+  return BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+      .addMBB(DestBB);
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();