[AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (llvm#77438)

Update SIMemoryLegalizer and SIInsertWaitcnts to use separate wait instructions per counter (e.g. S_WAIT_LOADCNT) and split VMCNT into separate LOADCNT, SAMPLECNT and BVHCNT counters.
ampandey-1995 · Jan 19, 2024 · cc728c1 · cc728c1
1 parent 5af0b8d
commit cc728c1
Show file tree

Hide file tree

Showing 109 changed files with 5,939 additions and 3,912 deletions.
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1242,7 +1242,8 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
       case AMDGPU::S_WAITCNT: {
         const int64_t Imm = MI.getOperand(0).getImm();
         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
-        return (Decoded.LgkmCnt == 0);
+        // DsCnt corresponds to LGKMCnt here.
+        return (Decoded.DsCnt == 0);
       }
       default:
         // SOPP instructions cannot mitigate the hazard.

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1200,6 +1200,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
 
+  /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
+  /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
+  bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
+
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
   /// SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;