From 2cd8fc68e14cb5ae1a7f576ac84b4e3a5cc6111e Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 9 Jan 2024 10:24:38 +0000 Subject: [PATCH] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions Update SIMemoryLegalizer and SIInsertWaitcnts to use separate wait instructions per counter (e.g. S_WAIT_LOADCNT) and split VMCNT into separate LOADCNT, SAMPLECNT and BVHCNT counters. --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 3 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 1020 +++++++++++++---- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 37 +- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 180 +++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 7 + .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 146 ++- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 123 +- .../CodeGen/AMDGPU/GlobalISel/addsubu64.ll | 4 +- .../GlobalISel/clamp-fmed3-const-combine.ll | 48 +- .../GlobalISel/clamp-minmax-const-combine.ll | 102 +- .../GlobalISel/extractelement-stack-lower.ll | 24 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 220 ++-- .../GlobalISel/fmed3-min-max-const-combine.ll | 108 +- .../llvm.amdgcn.global.atomic.csub.ll | 38 +- .../llvm.amdgcn.image.atomic.dim.a16.ll | 84 +- .../llvm.amdgcn.image.atomic.dim.ll | 84 +- .../llvm.amdgcn.image.gather4.a16.dim.ll | 28 +- .../llvm.amdgcn.image.gather4.dim.ll | 36 +- .../llvm.amdgcn.image.getresinfo.a16.ll | 16 +- .../llvm.amdgcn.image.getresinfo.ll | 22 +- .../llvm.amdgcn.image.load.1d.d16.ll | 28 +- .../GlobalISel/llvm.amdgcn.image.load.1d.ll | 30 +- .../GlobalISel/llvm.amdgcn.image.load.2d.ll | 6 +- .../llvm.amdgcn.image.load.2darraymsaa.a16.ll | 6 +- .../llvm.amdgcn.image.load.2darraymsaa.ll | 6 +- .../llvm.amdgcn.image.load.3d.a16.ll | 6 +- .../GlobalISel/llvm.amdgcn.image.load.3d.ll | 6 +- .../llvm.amdgcn.image.sample.g16.ll | 22 +- .../GlobalISel/llvm.amdgcn.rsq.clamp.ll | 48 +- .../AMDGPU/GlobalISel/load-constant.96.ll | 140 ++- 
.../CodeGen/AMDGPU/GlobalISel/mubuf-global.ll | 84 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 66 +- llvm/test/CodeGen/AMDGPU/add.ll | 42 +- .../AMDGPU/atomic_optimizations_buffer.ll | 92 +- .../atomic_optimizations_global_pointer.ll | 128 +-- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 80 +- .../atomic_optimizations_struct_buffer.ll | 96 +- llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 45 +- .../AMDGPU/cgp-addressing-modes-smem.ll | 12 +- llvm/test/CodeGen/AMDGPU/clamp.ll | 184 +-- .../fast-unaligned-load-store.global.ll | 30 +- .../fast-unaligned-load-store.private.ll | 84 +- .../CodeGen/AMDGPU/flat-scratch-i8-i16.ll | 72 +- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 252 ++-- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 648 ++++++----- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 531 ++++----- llvm/test/CodeGen/AMDGPU/fmaximum.ll | 20 +- llvm/test/CodeGen/AMDGPU/fminimum.ll | 20 +- .../AMDGPU/fp-min-max-num-flat-atomics.ll | 8 +- .../AMDGPU/fp-min-max-num-global-atomics.ll | 4 +- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 376 +++--- .../test/CodeGen/AMDGPU/global-saddr-store.ll | 10 +- .../test/CodeGen/AMDGPU/global_atomics_i64.ll | 422 +++---- ...vm.amdgcn.global.atomic.ordered.add.b64.ll | 12 +- .../AMDGPU/llvm.amdgcn.image.a16.dim.ll | 54 +- .../AMDGPU/llvm.amdgcn.image.a16.encode.ll | 54 +- .../CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll | 108 +- .../llvm.amdgcn.image.gather4.a16.dim.ll | 28 +- .../AMDGPU/llvm.amdgcn.image.getlod.dim.ll | 14 +- .../AMDGPU/llvm.amdgcn.image.msaa.load.ll | 26 +- .../llvm.amdgcn.image.sample.a16.dim.ll | 78 +- .../llvm.amdgcn.image.sample.d16.dim.ll | 16 +- .../AMDGPU/llvm.amdgcn.image.sample.dim.ll | 124 +- .../llvm.amdgcn.image.sample.g16.encode.ll | 22 +- .../AMDGPU/llvm.amdgcn.image.sample.g16.ll | 22 +- .../AMDGPU/llvm.amdgcn.lds.direct.load.ll | 10 +- .../AMDGPU/llvm.amdgcn.lds.param.load.ll | 14 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 76 +- .../AMDGPU/llvm.amdgcn.permlane16.var.ll | 104 +- 
.../AMDGPU/llvm.amdgcn.raw.buffer.load.ll | 86 +- .../llvm.amdgcn.raw.tbuffer.load.d16.ll | 8 +- .../AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll | 24 +- .../llvm.amdgcn.raw.tbuffer.store.d16.ll | 10 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 12 +- .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 218 ++-- .../AMDGPU/llvm.amdgcn.s.buffer.load.ll | 72 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll | 14 +- .../llvm.amdgcn.struct.buffer.load.format.ll | 52 +- ....amdgcn.struct.buffer.load.format.v3f16.ll | 2 +- .../llvm.amdgcn.struct.tbuffer.load.d16.ll | 8 +- .../AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll | 30 +- .../llvm.amdgcn.struct.tbuffer.store.d16.ll | 10 +- .../llvm.amdgcn.struct.tbuffer.store.ll | 2 +- llvm/test/CodeGen/AMDGPU/load-constant-f32.ll | 4 +- llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 8 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 176 +-- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 158 +-- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 100 +- llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 24 +- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 208 ++-- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 24 +- .../memory-legalizer-private-nontemporal.ll | 48 +- .../memory-legalizer-private-volatile.ll | 56 +- llvm/test/CodeGen/AMDGPU/mul.ll | 72 +- llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 566 +++++---- .../CodeGen/AMDGPU/offset-split-global.ll | 566 +++++---- llvm/test/CodeGen/AMDGPU/readcyclecounter.ll | 4 +- llvm/test/CodeGen/AMDGPU/sub.ll | 54 +- .../CodeGen/AMDGPU/waitcnt-global-inv-wb.mir | 2 +- 101 files changed, 5599 insertions(+), 3722 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a7d8ff0242b801..7ca7722a5cebd1 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1242,7 +1242,8 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { case AMDGPU::S_WAITCNT: { const 
int64_t Imm = MI.getOperand(0).getImm(); AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm); - return (Decoded.LgkmCnt == 0); + // DsCnt corresponds to LGKMCnt here. + return (Decoded.DsCnt == 0); } default: // SOPP instructions cannot mitigate the hazard. diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f6f37f5170a403..f3d38c018d2709 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1175,6 +1175,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } + /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt + /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. + bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 1cb1d32707f2d7..37890881d57544 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -57,7 +57,18 @@ namespace { // associated with the operand. Used for determining whether // s_waitcnt instruction needs to be emitted. -enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; +enum InstCounterType { + LOAD_CNT = 0, // VMcnt prior to gfx12. + DS_CNT, // LGKMcnt prior to gfx12. + EXP_CNT, // + STORE_CNT, // VScnt in gfx10/gfx11. + NUM_NORMAL_INST_CNTS, + SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only. + BVH_CNT, // gfx12+ only. + KM_CNT, // gfx12+ only.
+ NUM_EXTENDED_INST_CNTS, + NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS +}; } // namespace namespace llvm { @@ -67,15 +78,23 @@ template <> struct enum_iteration_traits { } // namespace llvm namespace { -auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); } +// Return an iterator over all counters between LOAD_CNT (the first counter) +// and \c MaxCounter (exclusive, default value yields an enumeration over +// all counters). +auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { + return enum_seq(LOAD_CNT, MaxCounter); +} using RegInterval = std::pair; struct HardwareLimits { - unsigned VmcntMax; + unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. unsigned ExpcntMax; - unsigned LgkmcntMax; - unsigned VscntMax; + unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. + unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. + unsigned SamplecntMax; // gfx12+ only. + unsigned BvhcntMax; // gfx12+ only. + unsigned KmcntMax; // gfx12+ only. 
}; struct RegisterEncoding { @@ -86,31 +105,25 @@ struct RegisterEncoding { }; enum WaitEventType { - VMEM_ACCESS, // vector-memory read & write - VMEM_READ_ACCESS, // vector-memory read - VMEM_WRITE_ACCESS, // vector-memory write that is not scratch - SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch - LDS_ACCESS, // lds read & write - GDS_ACCESS, // gds read & write - SQ_MESSAGE, // send message - SMEM_ACCESS, // scalar-memory read & write - EXP_GPR_LOCK, // export holding on its data src - GDS_GPR_LOCK, // GDS holding on its data and addr src - EXP_POS_ACCESS, // write to export position - EXP_PARAM_ACCESS, // write to export parameter - VMW_GPR_LOCK, // vector-memory write holding on its data src - EXP_LDS_ACCESS, // read by ldsdir counting as export + VMEM_ACCESS, // vector-memory read & write + VMEM_READ_ACCESS, // vector-memory read + VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only) + VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only) + VMEM_WRITE_ACCESS, // vector-memory write that is not scratch + SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch + LDS_ACCESS, // lds read & write + GDS_ACCESS, // gds read & write + SQ_MESSAGE, // send message + SMEM_ACCESS, // scalar-memory read & write + EXP_GPR_LOCK, // export holding on its data src + GDS_GPR_LOCK, // GDS holding on its data and addr src + EXP_POS_ACCESS, // write to export position + EXP_PARAM_ACCESS, // write to export parameter + VMW_GPR_LOCK, // vector-memory write holding on its data src + EXP_LDS_ACCESS, // read by ldsdir counting as export NUM_WAIT_EVENTS, }; -static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { - (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), - (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | - (1 << SQ_MESSAGE), - (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | - (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS), - (1 << VMEM_WRITE_ACCESS) | (1 << 
SCRATCH_WRITE_ACCESS)}; - // The mapping is: // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots @@ -137,17 +150,33 @@ enum VmemType { // MIMG instructions with a sampler. VMEM_SAMPLER, // BVH instructions - VMEM_BVH + VMEM_BVH, + NUM_VMEM_TYPES }; +// Maps values of InstCounterType to the instruction that waits on that +// counter. Only used if GCNSubtarget::hasExtendedWaitCounts() +// returns true. +static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = { + AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT, + AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, + AMDGPU::S_WAIT_KMCNT}; + static bool updateVMCntOnly(const MachineInstr &Inst) { return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst); } +#ifndef NDEBUG +static bool isNormalMode(InstCounterType MaxCounter) { + return MaxCounter == NUM_NORMAL_INST_CNTS; +} +#endif // NDEBUG + VmemType getVmemType(const MachineInstr &Inst) { assert(updateVMCntOnly(Inst)); - if (!SIInstrInfo::isMIMG(Inst)) + if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) && + !SIInstrInfo::isVSAMPLE(Inst)) return VMEM_NOSAMPLER; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = @@ -156,25 +185,49 @@ VmemType getVmemType(const MachineInstr &Inst) { : BaseInfo->Sampler ? 
VMEM_SAMPLER : VMEM_NOSAMPLER; } -void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { +unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { switch (T) { - case VM_CNT: - Wait.VmCnt = std::min(Wait.VmCnt, Count); - break; + case LOAD_CNT: + return Wait.LoadCnt; case EXP_CNT: - Wait.ExpCnt = std::min(Wait.ExpCnt, Count); - break; - case LGKM_CNT: - Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count); - break; - case VS_CNT: - Wait.VsCnt = std::min(Wait.VsCnt, Count); - break; + return Wait.ExpCnt; + case DS_CNT: + return Wait.DsCnt; + case STORE_CNT: + return Wait.StoreCnt; + case SAMPLE_CNT: + return Wait.SampleCnt; + case BVH_CNT: + return Wait.BvhCnt; + case KM_CNT: + return Wait.KmCnt; default: llvm_unreachable("bad InstCounterType"); } } +void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { + unsigned &WC = getCounterRef(Wait, T); + WC = std::min(WC, Count); +} + +void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { + getCounterRef(Wait, T) = ~0u; +} + +unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { + return getCounterRef(Wait, T); +} + +// Mapping from event to counter according to the table masks. +InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { + for (auto T : inst_counter_types()) { + if (masks[T] & (1 << E)) + return T; + } + llvm_unreachable("event type has no associated counter"); +} + // This objects maintains the current score brackets of each wait counter, and // a per-register scoreboard for each wait counter. // @@ -185,20 +238,30 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { // "s_waitcnt 0" before use. 
class WaitcntBrackets { public: - WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits, - RegisterEncoding Encoding) - : ST(SubTarget), Limits(Limits), Encoding(Encoding) {} + WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, + HardwareLimits Limits, RegisterEncoding Encoding, + const unsigned *WaitEventMaskForInst, + InstCounterType SmemAccessCounter) + : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), + Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst), + SmemAccessCounter(SmemAccessCounter) {} unsigned getWaitCountMax(InstCounterType T) const { switch (T) { - case VM_CNT: - return Limits.VmcntMax; - case LGKM_CNT: - return Limits.LgkmcntMax; + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; case EXP_CNT: return Limits.ExpcntMax; - case VS_CNT: - return Limits.VscntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; default: break; } @@ -219,20 +282,11 @@ class WaitcntBrackets { return getScoreUB(T) - getScoreLB(T); } - // Mapping from event to counter. 
- InstCounterType eventCounter(WaitEventType E) const { - for (auto T : inst_counter_types()) { - if (WaitEventMaskForInst[T] & (1 << E)) - return T; - } - llvm_unreachable("event type has no associated counter"); - } - unsigned getRegScore(int GprNo, InstCounterType T) const { if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; } - assert(T == LGKM_CNT); + assert(T == SmemAccessCounter); return SgprScores[GprNo - NUM_ALL_VGPRS]; } @@ -269,15 +323,15 @@ class WaitcntBrackets { } bool hasPendingFlat() const { - return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && - LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || - (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] && - LastFlat[VM_CNT] <= ScoreUBs[VM_CNT])); + return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && + LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || + (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && + LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); } void setPendingFlat() { - LastFlat[VM_CNT] = ScoreUBs[VM_CNT]; - LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; + LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; + LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; } // Return true if there might be pending writes to the specified vgpr by VMEM @@ -293,8 +347,8 @@ class WaitcntBrackets { } void setNonKernelFunctionInitialState() { - setScoreUB(VS_CNT, getWaitCountMax(VS_CNT)); - PendingEvents |= WaitEventMaskForInst[VS_CNT]; + setScoreUB(STORE_CNT, getWaitCountMax(STORE_CNT)); + PendingEvents |= WaitEventMaskForInst[STORE_CNT]; } void print(raw_ostream &); @@ -331,7 +385,7 @@ class WaitcntBrackets { VgprUB = std::max(VgprUB, GprNo); VgprScores[T][GprNo] = Val; } else { - assert(T == LGKM_CNT); + assert(T == SmemAccessCounter); SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS); SgprScores[GprNo - NUM_ALL_VGPRS] = Val; } @@ -342,8 +396,11 @@ class WaitcntBrackets { unsigned OpNo, unsigned Val); const GCNSubtarget *ST = nullptr; + InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; HardwareLimits Limits = {}; RegisterEncoding Encoding = {}; + const unsigned 
*WaitEventMaskForInst; + InstCounterType SmemAccessCounter; unsigned ScoreLBs[NUM_INST_CNTS] = {0}; unsigned ScoreUBs[NUM_INST_CNTS] = {0}; unsigned PendingEvents = 0; @@ -354,20 +411,139 @@ class WaitcntBrackets { int VgprUB = -1; int SgprUB = -1; unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, only lgkmcnt is relevant. + // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only) are relevant. unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; // Bitmask of the VmemTypes of VMEM instructions that might have a pending // write to each vgpr. unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; }; +// This abstracts the logic for generating and updating S_WAIT* instructions +// away from the analysis that determines where they are needed. This was +// done because the set of counters and instructions for waiting on them +// underwent a major shift with gfx12, sufficiently so that having this +// abstraction allows the main analysis logic to be simpler than it would +// otherwise have had to become. +class WaitcntGenerator { +protected: + const GCNSubtarget *ST = nullptr; + const SIInstrInfo *TII = nullptr; + AMDGPU::IsaVersion IV; + InstCounterType MaxCounter; + +public: + WaitcntGenerator() {} + WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter) + : ST(ST), TII(ST->getInstrInfo()), + IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {} + + // Edits an existing sequence of wait count instructions according + // to an incoming Waitcnt value, which is itself updated to reflect + // any new wait count instructions which may need to be generated by + // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits + // were made. + // + // This editing will usually be merely updated operands, but it may also + // delete instructions if the incoming Wait value indicates they are not + // needed. 
It may also remove existing instructions for which a wait + // is needed if it can be determined that it is better to generate new + // instructions later, as can happen on gfx12. + virtual bool + applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, + MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, + MachineBasicBlock::instr_iterator It) const = 0; + + // Transform a soft waitcnt into a normal one. + bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; + + // Generates new wait count instructions according to the value of + // Wait, returning true if any new instructions were created. + virtual bool createNewWaitcnt(MachineBasicBlock &Block, + MachineBasicBlock::instr_iterator It, + AMDGPU::Waitcnt Wait) = 0; + + // Returns an array of bit masks which can be used to map values in + // WaitEventType to corresponding counter values in InstCounterType. + virtual const unsigned *getWaitEventMask() const = 0; + + virtual ~WaitcntGenerator() = default; +}; + +class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { +public: + WaitcntGeneratorPreGFX12() {} + WaitcntGeneratorPreGFX12(const GCNSubtarget *ST) + : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {} + + bool + applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, + MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, + MachineBasicBlock::instr_iterator It) const override; + + bool createNewWaitcnt(MachineBasicBlock &Block, + MachineBasicBlock::instr_iterator It, + AMDGPU::Waitcnt Wait) override; + + const unsigned *getWaitEventMask() const override { + assert(ST); + + static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { + (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) | + (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS), + (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | + (1 << SQ_MESSAGE), + (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | + (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | + (1 << EXP_LDS_ACCESS), + (1 << VMEM_WRITE_ACCESS) | 
(1 << SCRATCH_WRITE_ACCESS), + 0, + 0, + 0}; + + return WaitEventMaskForInstPreGFX12; + } +}; + +class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { +public: + WaitcntGeneratorGFX12Plus() {} + WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter) + : WaitcntGenerator(ST, MaxCounter) {} + + bool + applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, + MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, + MachineBasicBlock::instr_iterator It) const override; + + bool createNewWaitcnt(MachineBasicBlock &Block, + MachineBasicBlock::instr_iterator It, + AMDGPU::Waitcnt Wait) override; + + const unsigned *getWaitEventMask() const override { + assert(ST); + + static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { + (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), + (1 << LDS_ACCESS) | (1 << GDS_ACCESS), + (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | + (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | + (1 << EXP_LDS_ACCESS), + (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS), + (1 << VMEM_SAMPLER_READ_ACCESS), + (1 << VMEM_BVH_READ_ACCESS), + (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)}; + + return WaitEventMaskForInstGFX12Plus; + } +}; + class SIInsertWaitcnts : public MachineFunctionPass { private: const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; - AMDGPU::IsaVersion IV; DenseMap SLoadAddresses; DenseMap PreheadersToFlush; @@ -379,6 +555,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { bool Dirty = true; }; + InstCounterType SmemAccessCounter; + MapVector BlockInfos; // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 @@ -388,10 +566,20 @@ class SIInsertWaitcnts : public MachineFunctionPass { bool OptNone; + // In any given run of this pass, WCG will point to one of these two + // generator objects, which must have been re-initialised before use + // from a value made using a 
subtarget constructor. + WaitcntGeneratorPreGFX12 WCGPreGFX12; + WaitcntGeneratorGFX12Plus WCGGFX12Plus; + + WaitcntGenerator *WCG = nullptr; + // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS // message. DenseSet ReleaseVGPRInsts; + InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; + public: static char ID; @@ -438,16 +626,22 @@ class SIInsertWaitcnts : public MachineFunctionPass { if (DebugCounter::isCounterSet(ForceLgkmCounter) && DebugCounter::shouldExecute(ForceLgkmCounter)) { - ForceEmitWaitcnt[LGKM_CNT] = true; + ForceEmitWaitcnt[DS_CNT] = true; + ForceEmitWaitcnt[KM_CNT] = true; } else { - ForceEmitWaitcnt[LGKM_CNT] = false; + ForceEmitWaitcnt[DS_CNT] = false; + ForceEmitWaitcnt[KM_CNT] = false; } if (DebugCounter::isCounterSet(ForceVMCounter) && DebugCounter::shouldExecute(ForceVMCounter)) { - ForceEmitWaitcnt[VM_CNT] = true; + ForceEmitWaitcnt[LOAD_CNT] = true; + ForceEmitWaitcnt[SAMPLE_CNT] = true; + ForceEmitWaitcnt[BVH_CNT] = true; } else { - ForceEmitWaitcnt[VM_CNT] = false; + ForceEmitWaitcnt[LOAD_CNT] = false; + ForceEmitWaitcnt[SAMPLE_CNT] = false; + ForceEmitWaitcnt[BVH_CNT] = false; } #endif // NDEBUG } @@ -455,6 +649,10 @@ class SIInsertWaitcnts : public MachineFunctionPass { // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or // FLAT instruction. WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { + // Maps VMEM access types to their corresponding WaitEventType. + static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { + VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; + assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); // LDS DMA loads are also stores, but on the LDS side. On the VMEM side // these should use VM_CNT. 
@@ -467,7 +665,9 @@ class SIInsertWaitcnts : public MachineFunctionPass { return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } - return VMEM_READ_ACCESS; + if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst)) + return VMEM_READ_ACCESS; + return VmemReadMapping[getVmemType(Inst)]; } bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; @@ -488,13 +688,6 @@ class SIInsertWaitcnts : public MachineFunctionPass { WaitcntBrackets *ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); - bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, - MachineInstr &OldWaitcntInstr, - AMDGPU::Waitcnt &Wait, - MachineBasicBlock::instr_iterator It) const; - - // Transform a soft waitcnt into a normal one. - bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; }; } // end anonymous namespace @@ -556,8 +749,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(E); - unsigned CurrScore = getScoreUB(T) + 1; + InstCounterType T = eventCounter(WaitEventMaskForInst, E); + + unsigned UB = getScoreUB(T); + unsigned CurrScore = UB + 1; if (CurrScore == 0) report_fatal_error("InsertWaitcnt score wraparound"); // PendingEvents and ScoreUB need to be update regardless if this event @@ -686,7 +881,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, if (!Op.isReg() || !Op.isDef()) continue; RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I); - if (T == VM_CNT) { + if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { if (Interval.first >= NUM_ALL_VGPRS) continue; if (updateVMCntOnly(Inst)) { @@ -714,21 +909,33 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, void WaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; - for (auto T : inst_counter_types()) { + for (auto T : inst_counter_types(MaxCounter)) { unsigned SR = 
getScoreRange(T); switch (T) { - case VM_CNT: - OS << " VM_CNT(" << SR << "): "; + case LOAD_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" + << SR << "): "; break; - case LGKM_CNT: - OS << " LGKM_CNT(" << SR << "): "; + case DS_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT(" + << SR << "): "; break; case EXP_CNT: OS << " EXP_CNT(" << SR << "): "; break; - case VS_CNT: - OS << " VS_CNT(" << SR << "): "; + case STORE_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT(" + << SR << "): "; + break; + case SAMPLE_CNT: + OS << " SAMPLE_CNT(" << SR << "): "; + break; + case BVH_CNT: + OS << " BVH_CNT(" << SR << "): "; + break; + case KM_CNT: + OS << " KM_CNT(" << SR << "): "; break; default: OS << " UNKNOWN(" << SR << "): "; @@ -751,9 +958,9 @@ void WaitcntBrackets::print(raw_ostream &OS) { } } // Also need to print sgpr scores for lgkm_cnt. - if (T == LGKM_CNT) { + if (T == SmemAccessCounter) { for (int J = 0; J <= SgprUB; J++) { - unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T); if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; @@ -769,10 +976,13 @@ void WaitcntBrackets::print(raw_ostream &OS) { /// Simplify the waitcnt, in the sense of removing redundant counts, and return /// whether a waitcnt instruction is needed at all. 
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { - simplifyWaitcnt(VM_CNT, Wait.VmCnt); + simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); - simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); - simplifyWaitcnt(VS_CNT, Wait.VsCnt); + simplifyWaitcnt(DS_CNT, Wait.DsCnt); + simplifyWaitcnt(STORE_CNT, Wait.StoreCnt); + simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); + simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); + simplifyWaitcnt(KM_CNT, Wait.KmCnt); } void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, @@ -793,8 +1003,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, int RegNo, const unsigned LB = getScoreLB(T); const unsigned UB = getScoreUB(T); if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { - if ((T == VM_CNT || T == LGKM_CNT) && - hasPendingFlat() && + if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && !ST->hasFlatLgkmVMemCountInOrder()) { // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need @@ -815,10 +1024,13 @@ void WaitcntBrackets::determineWait(InstCounterType T, int RegNo, } void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { - applyWaitcnt(VM_CNT, Wait.VmCnt); + applyWaitcnt(LOAD_CNT, Wait.LoadCnt); applyWaitcnt(EXP_CNT, Wait.ExpCnt); - applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); - applyWaitcnt(VS_CNT, Wait.VsCnt); + applyWaitcnt(DS_CNT, Wait.DsCnt); + applyWaitcnt(STORE_CNT, Wait.StoreCnt); + applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); + applyWaitcnt(BVH_CNT, Wait.BvhCnt); + applyWaitcnt(KM_CNT, Wait.KmCnt); } void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { @@ -839,7 +1051,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { // the decrement may go out of order. bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. 
- if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) + if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) return true; return hasMixedPendingEvents(T); } @@ -873,22 +1085,49 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, return true; } -bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { - unsigned Opcode = Waitcnt->getOpcode(); - if (!SIInstrInfo::isSoftWaitcnt(Opcode)) +/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction, +/// and if so, which counter it is waiting on. +static std::optional counterTypeForInstr(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::S_WAIT_LOADCNT: + return LOAD_CNT; + case AMDGPU::S_WAIT_EXPCNT: + return EXP_CNT; + case AMDGPU::S_WAIT_STORECNT: + return STORE_CNT; + case AMDGPU::S_WAIT_SAMPLECNT: + return SAMPLE_CNT; + case AMDGPU::S_WAIT_BVHCNT: + return BVH_CNT; + case AMDGPU::S_WAIT_DSCNT: + return DS_CNT; + case AMDGPU::S_WAIT_KMCNT: + return KM_CNT; + default: + return {}; + } +} + +bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { + unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode()); + if (Opcode == Waitcnt->getOpcode()) return false; - Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode))); + Waitcnt->setDesc(TII->get(Opcode)); return true; } -/// Combine consecutive waitcnt instructions that precede \p It and follow -/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added -/// by previous passes. Currently this pass conservatively assumes that these -/// preexisting waitcnt are required for correctness. -bool SIInsertWaitcnts::applyPreexistingWaitcnt( +/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that +/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits +/// from \p Wait that were added by previous passes. Currently this pass +/// conservatively assumes that these preexisting waits are required for +/// correctness. 
+bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { + assert(ST); + assert(isNormalMode(MaxCounter)); + bool Modified = false; MachineInstr *WaitcntInstr = nullptr; MachineInstr *WaitcntVsCntInstr = nullptr; @@ -898,12 +1137,12 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( if (II.isMetaInstruction()) continue; - unsigned Opcode = II.getOpcode(); - bool IsSoft = SIInstrInfo::isSoftWaitcnt(Opcode); + unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); + bool IsSoft = Opcode != II.getOpcode(); - if (SIInstrInfo::isWaitcnt(Opcode)) { - // Update required wait count. If this is a soft waitcnt (= it was added - // by an earlier pass), it may be entirely removed. + // Update required wait count. If this is a soft waitcnt (= it was added + // by an earlier pass), it may be entirely removed. + if (Opcode == AMDGPU::S_WAITCNT) { unsigned IEnc = II.getOperand(0).getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); if (IsSoft) @@ -911,23 +1150,22 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( Wait = Wait.combined(OldWait); // Merge consecutive waitcnt of the same type by erasing multiples. 
- if (WaitcntInstr || (!Wait.hasWaitExceptVsCnt() && IsSoft)) { + if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) { II.eraseFromParent(); Modified = true; } else WaitcntInstr = &II; - } else { - assert(SIInstrInfo::isWaitcntVsCnt(Opcode)); + assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); unsigned OldVSCnt = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); if (IsSoft) - ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt); - Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt); + ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt); + Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt); - if (WaitcntVsCntInstr || (!Wait.hasWaitVsCnt() && IsSoft)) { + if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) { II.eraseFromParent(); Modified = true; } else @@ -935,18 +1173,19 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( } } - // Updated encoding of merged waitcnt with the required wait. if (WaitcntInstr) { Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16, AMDGPU::encodeWaitcnt(IV, Wait)); Modified |= promoteSoftWaitCnt(WaitcntInstr); - ScoreBrackets.applyWaitcnt(Wait); - Wait.VmCnt = ~0u; - Wait.LgkmCnt = ~0u; + ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt); + ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt); + ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); + Wait.LoadCnt = ~0u; Wait.ExpCnt = ~0u; + Wait.DsCnt = ~0u; - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + LLVM_DEBUG(It == WaitcntInstr->getParent()->end() ? 
dbgs() << "applyPreexistingWaitcnt\n" << "New Instr at block end: " << *WaitcntInstr << '\n' @@ -957,12 +1196,13 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( if (WaitcntVsCntInstr) { Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr, - AMDGPU::OpName::simm16, Wait.VsCnt); + AMDGPU::OpName::simm16, Wait.StoreCnt); Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr); - ScoreBrackets.applyWaitcnt(Wait); - Wait.VsCnt = ~0u; - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); + Wait.StoreCnt = ~0u; + + LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end() ? dbgs() << "applyPreexistingWaitcnt\n" << "New Instr at block end: " << *WaitcntVsCntInstr << '\n' @@ -974,6 +1214,293 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( return Modified; } +/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any +/// required counters in \p Wait +bool WaitcntGeneratorPreGFX12::createNewWaitcnt( + MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, + AMDGPU::Waitcnt Wait) { + assert(ST); + assert(isNormalMode(MaxCounter)); + + bool Modified = false; + const DebugLoc &DL = Block.findDebugLoc(It); + + // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a + // single instruction while VScnt has its own instruction. 
+ if (Wait.hasWaitExceptStoreCnt()) { + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } + + if (Wait.hasWaitStoreCnt()) { + assert(ST->hasVscnt()); + + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.StoreCnt); + Modified = true; + + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } + + return Modified; +} + +/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and +/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that +/// were added by previous passes. Currently this pass conservatively +/// assumes that these preexisting waits are required for correctness. +bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( + WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, + AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { + assert(ST); + assert(!isNormalMode(MaxCounter)); + + bool Modified = false; + MachineInstr *CombinedLoadDsCntInstr = nullptr; + MachineInstr *CombinedStoreDsCntInstr = nullptr; + MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {}; + + for (auto &II : + make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { + if (II.isMetaInstruction()) + continue; + + MachineInstr **UpdatableInstr; + + // Update required wait count. If this is a soft waitcnt (= it was added + // by an earlier pass), it may be entirely removed. 
+ + unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); + bool IsSoft = Opcode != II.getOpcode(); + + if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) { + unsigned OldEnc = + TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc); + if (IsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait); + Wait = Wait.combined(OldWait); + UpdatableInstr = &CombinedLoadDsCntInstr; + } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) { + unsigned OldEnc = + TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc); + if (IsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait); + Wait = Wait.combined(OldWait); + UpdatableInstr = &CombinedStoreDsCntInstr; + } else { + std::optional CT = counterTypeForInstr(Opcode); + assert(CT.has_value()); + unsigned OldCnt = + TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + if (IsSoft) + ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt); + addWait(Wait, CT.value(), OldCnt); + UpdatableInstr = &WaitInstrs[CT.value()]; + } + + // Merge consecutive waitcnt of the same type by erasing multiples. + if (!*UpdatableInstr) { + *UpdatableInstr = &II; + } else { + II.eraseFromParent(); + Modified = true; + } + } + + if (CombinedLoadDsCntInstr) { + // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need + // to be waited for. Otherwise, let the instruction be deleted so + // the appropriate single counter wait instruction can be inserted + // instead, when new S_WAIT_*CNT instructions are inserted by + // createNewWaitcnt(). As a side effect, resetting the wait counts will + // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by + // the loop below that deals with single counter instructions. 
+ if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) { + unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait); + Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr, + AMDGPU::OpName::simm16, NewEnc); + Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr); + ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt); + ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); + Wait.LoadCnt = ~0u; + Wait.DsCnt = ~0u; + + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " + << *CombinedLoadDsCntInstr << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It << "New Instr: " + << *CombinedLoadDsCntInstr << '\n'); + } else { + CombinedLoadDsCntInstr->eraseFromParent(); + Modified = true; + } + } + + if (CombinedStoreDsCntInstr) { + // Similarly for S_WAIT_STORECNT_DSCNT. + if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) { + unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait); + Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr, + AMDGPU::OpName::simm16, NewEnc); + Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr); + ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); + ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); + Wait.StoreCnt = ~0u; + Wait.DsCnt = ~0u; + + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " + << *CombinedStoreDsCntInstr << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It << "New Instr: " + << *CombinedStoreDsCntInstr << '\n'); + } else { + CombinedStoreDsCntInstr->eraseFromParent(); + Modified = true; + } + } + + // Look for an opportunity to convert existing S_WAIT_LOADCNT, + // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT + // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing + // instructions so that createNewWaitcnt() will create new combined + // instructions to replace them. 
+ + if (Wait.DsCnt != ~0u) { + // This is a vector of addresses in WaitInstrs pointing to instructions + // that should be removed if they are present. + SmallVector WaitsToErase; + + // If it's known that both DScnt and either LOADcnt or STOREcnt (but not + // both) need to be waited for, ensure that there are no existing + // individual wait count instructions for these. + + if (Wait.LoadCnt != ~0u) { + WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]); + WaitsToErase.push_back(&WaitInstrs[DS_CNT]); + } else if (Wait.StoreCnt != ~0u) { + WaitsToErase.push_back(&WaitInstrs[STORE_CNT]); + WaitsToErase.push_back(&WaitInstrs[DS_CNT]); + } + + for (MachineInstr **WI : WaitsToErase) { + if (!*WI) + continue; + + (*WI)->eraseFromParent(); + *WI = nullptr; + Modified = true; + } + } + + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + if (!WaitInstrs[CT]) + continue; + + unsigned NewCnt = getWait(Wait, CT); + if (NewCnt != ~0u) { + Modified |= updateOperandIfDifferent(*WaitInstrs[CT], + AMDGPU::OpName::simm16, NewCnt); + Modified |= promoteSoftWaitCnt(WaitInstrs[CT]); + + ScoreBrackets.applyWaitcnt(CT, NewCnt); + setNoWait(Wait, CT); + + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " << *WaitInstrs[CT] + << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It + << "New Instr: " << *WaitInstrs[CT] << '\n'); + } else { + WaitInstrs[CT]->eraseFromParent(); + Modified = true; + } + } + + return Modified; +} + +/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait +bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( + MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, + AMDGPU::Waitcnt Wait) { + assert(ST); + assert(!isNormalMode(MaxCounter)); + + bool Modified = false; + const DebugLoc &DL = Block.findDebugLoc(It); + + // Check for opportunities to use combined wait instructions. 
+ if (Wait.DsCnt != ~0u) { + MachineInstr *SWaitInst = nullptr; + + if (Wait.LoadCnt != ~0u) { + unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait); + + SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) + .addImm(Enc); + + Wait.LoadCnt = ~0u; + Wait.DsCnt = ~0u; + } else if (Wait.StoreCnt != ~0u) { + unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait); + + SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT)) + .addImm(Enc); + + Wait.StoreCnt = ~0u; + Wait.DsCnt = ~0u; + } + + if (SWaitInst) { + Modified = true; + + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } + } + + // Generate an instruction for any remaining counter that needs + // waiting for. + + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + unsigned Count = getWait(Wait, CT); + if (Count == ~0u) + continue; + + auto SWaitInst = + BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) + .addImm(Count); + + Modified = true; + + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } + + return Modified; +} + static bool readsVCCZ(const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && @@ -1027,7 +1554,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { - Wait.VmCnt = 0; + Wait.LoadCnt = 0; } // All waits must be resolved at call return. 
@@ -1037,16 +1564,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined(AMDGPU::Waitcnt::allZeroExceptVsCnt()); + Wait = Wait.combined( + AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts())); } // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM // stores. In this case it can be useful to send a message to explicitly // release all VGPRs before the stores have completed, but it is only safe to - // do this if there are no outstanding scratch stores. + // do this if: + // * there are no outstanding scratch stores + // * we are not in Dynamic VGPR mode else if (MI.getOpcode() == AMDGPU::S_ENDPGM || MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone && - ScoreBrackets.getScoreRange(VS_CNT) != 0 && + ScoreBrackets.getScoreRange(STORE_CNT) != 0 && !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) ReleaseVGPRInsts.insert(&MI); } @@ -1056,7 +1586,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ST->hasLegacyGeometry() && ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { - Wait.VmCnt = 0; + Wait.LoadCnt = 0; } #if 0 // TODO: the following blocks of logic when we have fence. 
else if (MI.getOpcode() == SC_FENCE) { @@ -1073,12 +1603,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, case SCMEM_LDS: if (group_is_multi_wave || context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { - EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, - ScoreBrackets->getScoreUB(LGKM_CNT)); - // LDS may have to wait for VM_CNT after buffer load to LDS + EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT, + ScoreBrackets->getScoreUB(DS_CNT)); + // LDS may have to wait for VMcnt after buffer load to LDS if (target_info->HasBufferLoadToLDS()) { - EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, - ScoreBrackets->getScoreUB(VM_CNT)); + EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT, + ScoreBrackets->getScoreUB(LOAD_CNT)); } } break; @@ -1087,8 +1617,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (group_is_multi_wave || fence_is_global) { EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, - ScoreBrackets->getScoreUB(LGKM_CNT)); + EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT, + ScoreBrackets->getScoreUB(DS_CNT)); } break; @@ -1099,8 +1629,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (group_is_multi_wave || fence_is_global) { EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, - ScoreBrackets->getScoreUB(VM_CNT)); + EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT, + ScoreBrackets->getScoreUB(LOAD_CNT)); } break; @@ -1143,7 +1673,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, for (int RegNo = CallAddrOpInterval.first; RegNo < CallAddrOpInterval.second; ++RegNo) - ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); + ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait); int RtnAddrOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); @@ 
-1153,7 +1683,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, for (int RegNo = RtnAddrOpInterval.first; RegNo < RtnAddrOpInterval.second; ++RegNo) - ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); + ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait); } } } else { @@ -1170,10 +1700,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // instruction to guarantee the right WAW order. // 2) If a destination operand that was used by a recent export/store ins, // add s_waitcnt on exp_cnt to guarantee the WAR order. + for (const MachineMemOperand *Memop : MI.memoperands()) { const Value *Ptr = Memop->getValue(); if (Memop->isStore() && SLoadAddresses.count(Ptr)) { - addWait(Wait, LGKM_CNT, 0); + addWait(Wait, SmemAccessCounter, 0); if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second)) SLoadAddresses.erase(Ptr); } @@ -1184,8 +1715,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (TII->mayWriteLDSThroughDMA(MI)) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); + // LOAD_CNT is only relevant to vgpr or LDS. 
+ ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); if (Memop->isStore()) { ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } @@ -1213,14 +1744,18 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isUse() || !updateVMCntOnly(MI) || ScoreBrackets.hasOtherPendingVmemTypes(RegNo, getVmemType(MI))) { - ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); + ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait); + ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait); ScoreBrackets.clearVgprVmemTypes(RegNo); } if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } + ScoreBrackets.determineWait(DS_CNT, RegNo, Wait); + } else { + ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait); } - ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); } } } @@ -1232,7 +1767,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { - Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); + Wait = Wait.combined( + AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt())); } // TODO: Remove this work-around, enable the assert for Bug 457939 @@ -1240,7 +1776,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // independent of target. 
if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { - Wait.LgkmCnt = 0; + Wait.DsCnt = 0; } } @@ -1248,35 +1784,54 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.simplifyWaitcnt(Wait); if (ForceEmitZeroWaitcnts) - Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(); + Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()); - if (ForceEmitWaitcnt[VM_CNT]) - Wait.VmCnt = 0; + if (ForceEmitWaitcnt[LOAD_CNT]) + Wait.LoadCnt = 0; if (ForceEmitWaitcnt[EXP_CNT]) Wait.ExpCnt = 0; - if (ForceEmitWaitcnt[LGKM_CNT]) - Wait.LgkmCnt = 0; + if (ForceEmitWaitcnt[DS_CNT]) + Wait.DsCnt = 0; + if (ForceEmitWaitcnt[SAMPLE_CNT]) + Wait.SampleCnt = 0; + if (ForceEmitWaitcnt[BVH_CNT]) + Wait.BvhCnt = 0; + if (ForceEmitWaitcnt[KM_CNT]) + Wait.KmCnt = 0; if (FlushVmCnt) { - if (ScoreBrackets.hasPendingEvent(VM_CNT)) - Wait.VmCnt = 0; + if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) + Wait.LoadCnt = 0; + if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) + Wait.SampleCnt = 0; + if (ScoreBrackets.hasPendingEvent(BVH_CNT)) + Wait.BvhCnt = 0; } return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets, OldWaitcntInstr); } -// Add a waitcnt to flush the vmcnt counter at the end of the given block if -// needed. +// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the +// end of the given block if needed. 
bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr) { AMDGPU::Waitcnt Wait; - if (!ScoreBrackets.hasPendingEvent(VM_CNT)) + unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT); + unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT); + unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT); + + if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0) return false; - Wait.VmCnt = 0; + if (LoadCntPending != 0) + Wait.LoadCnt = 0; + if (SampleCntPending != 0) + Wait.SampleCnt = 0; + if (BvhCntPending != 0) + Wait.BvhCnt = 0; return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, OldWaitcntInstr); @@ -1288,15 +1843,16 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr) { bool Modified = false; - const DebugLoc &DL = Block.findDebugLoc(It); if (OldWaitcntInstr) // Try to merge the required wait with preexisting waitcnt instructions. // Also erase redundant waitcnt. Modified = - applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); - else - ScoreBrackets.applyWaitcnt(Wait); + WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); + + // Any counts that could have been applied to any existing waitcnt + // instructions will have been done so, now deal with any remaining. + ScoreBrackets.applyWaitcnt(Wait); // ExpCnt can be merged into VINTERP. if (Wait.ExpCnt != ~0u && It != Block.instr_end() && @@ -1309,35 +1865,13 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, } Wait.ExpCnt = ~0u; - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" + LLVM_DEBUG(dbgs() << "generateWaitcnt\n" << "Update Instr: " << *It); } - // Build new waitcnt instructions unless no wait is needed or the old waitcnt - // instruction was modified to handle the required wait. 
- if (Wait.hasWaitExceptVsCnt()) { - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); - } - - if (Wait.hasWaitVsCnt()) { - assert(ST->hasVscnt()); - - [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.VsCnt); - Modified = true; - - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); - } return Modified; } @@ -1435,7 +1969,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. - // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. + // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere. + if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { if (TII->isAlwaysGDS(Inst.getOpcode()) || TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { @@ -1486,7 +2021,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } else if (Inst.isCall()) { if (callWaitsOnFunctionReturn(Inst)) { // Act as a wait on everything - ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt()); + ScoreBrackets->applyWaitcnt( + AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts())); } else { // May need to way wait for anything. 
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); @@ -1545,7 +2081,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { VgprUB = std::max(VgprUB, Other.VgprUB); SgprUB = std::max(SgprUB, Other.SgprUB); - for (auto T : inst_counter_types()) { + for (auto T : inst_counter_types(MaxCounter)) { // Merge event flags for this counter const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; @@ -1573,7 +2109,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { for (int J = 0; J <= VgprUB; J++) StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); - if (T == LGKM_CNT) { + if (T == SmemAccessCounter) { for (int J = 0; J <= SgprUB; J++) StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); } @@ -1589,10 +2125,13 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { } static bool isWaitInstr(MachineInstr &Inst) { - auto Opcode = Inst.getOpcode(); - return SIInstrInfo::isWaitcnt(Opcode) || - (SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() && - Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL); + unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode()); + return Opcode == AMDGPU::S_WAITCNT || + (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() && + Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || + Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || + Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || + counterTypeForInstr(Opcode).has_value(); } // Generate s_waitcnt instructions where needed. @@ -1698,8 +2237,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // an S_WAITCNT vmcnt(0) if (RequireCheckResourceType(Inst, context)) { // Force the score to as if an S_WAITCNT vmcnt(0) is emitted. 
- ScoreBrackets->setScoreLB(VM_CNT, - ScoreBrackets->getScoreUB(VM_CNT)); + ScoreBrackets->setScoreLB(LOAD_CNT, + ScoreBrackets->getScoreUB(LOAD_CNT)); } #endif @@ -1801,7 +2340,12 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, VgprUse.insert(RegNo); // If at least one of Op's registers is in the score brackets, the // value is likely loaded outside of the loop. - if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) { + if (Brackets.getRegScore(RegNo, LOAD_CNT) > + Brackets.getScoreLB(LOAD_CNT) || + Brackets.getRegScore(RegNo, SAMPLE_CNT) > + Brackets.getScoreLB(SAMPLE_CNT) || + Brackets.getRegScore(RegNo, BVH_CNT) > + Brackets.getScoreLB(BVH_CNT)) { UsesVgprLoadedOutside = true; break; } @@ -1829,23 +2373,46 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); - IV = AMDGPU::getIsaVersion(ST->getCPU()); const SIMachineFunctionInfo *MFI = MF.getInfo(); MLI = &getAnalysis(); PDT = &getAnalysis(); + AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); + + if (ST->hasExtendedWaitCounts()) { + MaxCounter = NUM_EXTENDED_INST_CNTS; + WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter); + WCG = &WCGGFX12Plus; + } else { + MaxCounter = NUM_NORMAL_INST_CNTS; + WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST); + WCG = &WCGPreGFX12; + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; + const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); + + SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); + OptNone = MF.getFunction().hasOptNone() || MF.getTarget().getOptLevel() == CodeGenOptLevel::None; HardwareLimits Limits = {}; - Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV); + if (ST->hasExtendedWaitCounts()) { + Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); + Limits.DscntMax = AMDGPU::getDscntBitMask(IV); + } else { + Limits.LoadcntMax = 
AMDGPU::getVmcntBitMask(IV); + Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV); + } Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); - Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); - Limits.VscntMax = ST->hasVscnt() ? 63 : 0; + Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV); + Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV); + Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV); + Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); @@ -1863,6 +2430,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { BlockInfos.clear(); bool Modified = false; + MachineBasicBlock &EntryBB = MF.front(); + MachineBasicBlock::iterator I = EntryBB.begin(); + if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may // depend on. We can't track them and it's better to do the wait after the @@ -1870,15 +2440,28 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // TODO: Could insert earlier and schedule more liberally with operations // that only use caller preserved registers. 
- MachineBasicBlock &EntryBB = MF.front(); - MachineBasicBlock::iterator I = EntryBB.begin(); for (MachineBasicBlock::iterator E = EntryBB.end(); I != E && (I->isPHI() || I->isMetaInstruction()); ++I) ; - BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); - auto NonKernelInitialState = - std::make_unique(ST, Limits, Encoding); + if (ST->hasExtendedWaitCounts()) { + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) + .addImm(0); + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT) + continue; + + BuildMI(EntryBB, I, DebugLoc(), + TII->get(instrsForExtendedCounterTypes[CT])) + .addImm(0); + } + } else { + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); + } + + auto NonKernelInitialState = std::make_unique( + ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst, + SmemAccessCounter); NonKernelInitialState->setNonKernelFunctionInitialState(); BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); @@ -1909,9 +2492,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) - Brackets = std::make_unique(ST, Limits, Encoding); + Brackets = std::make_unique( + ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst, + SmemAccessCounter); else - *Brackets = WaitcntBrackets(ST, Limits, Encoding); + *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding, + WaitEventMaskForInst, SmemAccessCounter); } Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d4c7a457e9aae2..5380f6d1fc88f6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9065,8 +9065,7 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { } int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { - if (SIInstrInfo::isSoftWaitcnt(Opcode)) - Opcode = 
SIInstrInfo::getNonSoftWaitcntOpcode(Opcode); + Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode); unsigned Gen = subtargetEncodingFamily(ST); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 37ee159362a28c..6d0d818df84aa6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -905,29 +905,24 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { } static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) { - if (isWaitcnt(Opcode)) + switch (Opcode) { + case AMDGPU::S_WAITCNT_soft: return AMDGPU::S_WAITCNT; - - if (isWaitcntVsCnt(Opcode)) + case AMDGPU::S_WAITCNT_VSCNT_soft: return AMDGPU::S_WAITCNT_VSCNT; - - llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT"); - } - - static bool isWaitcnt(unsigned Opcode) { - return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft; - } - - static bool isWaitcntVsCnt(unsigned Opcode) { - return Opcode == AMDGPU::S_WAITCNT_VSCNT || - Opcode == AMDGPU::S_WAITCNT_VSCNT_soft; - } - - // "Soft" waitcnt instructions can be relaxed/optimized out by - // SIInsertWaitcnts. 
- static bool isSoftWaitcnt(unsigned Opcode) { - return Opcode == AMDGPU::S_WAITCNT_soft || - Opcode == AMDGPU::S_WAITCNT_VSCNT_soft; + case AMDGPU::S_WAIT_LOADCNT_soft: + return AMDGPU::S_WAIT_LOADCNT; + case AMDGPU::S_WAIT_STORECNT_soft: + return AMDGPU::S_WAIT_STORECNT; + case AMDGPU::S_WAIT_SAMPLECNT_soft: + return AMDGPU::S_WAIT_SAMPLECNT; + case AMDGPU::S_WAIT_BVHCNT_soft: + return AMDGPU::S_WAIT_BVHCNT; + case AMDGPU::S_WAIT_DSCNT_soft: + return AMDGPU::S_WAIT_DSCNT; + default: + return Opcode; + } } bool isVGPRCopy(const MachineInstr &MI) const { diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 6d749ad1ad24f6..84b9330ef9633e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -579,11 +579,30 @@ class SIGfx11CacheControl : public SIGfx10CacheControl { }; class SIGfx12CacheControl : public SIGfx11CacheControl { +protected: + // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. + // \returns Returns true if \p MI is modified, false otherwise. + bool setTH(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const; + // Sets Scope policy to \p Value if CPol operand is present in instruction \p + // MI. \returns Returns true if \p MI is modified, false otherwise. 
+ bool setScope(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const; + public: SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} + bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsCrossAddrSpaceOrdering, Position Pos) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; }; class SIMemoryLegalizer final : public MachineFunctionPass { @@ -2142,6 +2161,132 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( return Changed; } +bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const { + MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); + if (!CPol) + return false; + + uint64_t NewTH = Value & AMDGPU::CPol::TH; + if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { + CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); + return true; + } + + return false; +} + +bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const { + MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); + if (!CPol) + return false; + + uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; + if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { + CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); + return true; + } + + return false; +} + +bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + bool LOADCnt = false; + bool DSCnt = 
false; + bool STORECnt = false; + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != + SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + LOADCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + STORECnt |= true; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to wait for operations to complete to ensure + // they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode and all waves of a work-group are on the same CU + // which shares the same L0. + if (!ST.isCuModeEnabled()) { + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + LOADCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + STORECnt |= true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The L0 cache keeps all memory operations in order for + // work-items in the same wavefront. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + // If no cross address space ordering then an "S_WAIT_DSCNT 0" is + // not needed as LDS operations for all waves are executed in a total + // global ordering as observed by all waves. Required if also + // synchronizing with global/GDS memory as LDS operations could be + // reordered with respect to later global/GDS memory operations of the + // same wave. + DSCnt |= IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The LDS keeps all memory operations in order for + // the same wavefront. 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (LOADCnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); + Changed = true; + } + + if (STORECnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); + Changed = true; + } + + if (DSCnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); + Changed = true; + } + + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, @@ -2198,6 +2343,41 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return true; } +bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( + MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, bool IsNonTemporal) const { + + // Only handle load and store, not atomic read-modify-write instructions. + assert(MI->mayLoad() ^ MI->mayStore()); + + // Only update load and store, not LLVM IR atomic read-modify-write + // instructions. The latter are always marked as volatile so cannot sensibly + // handle it as do not want to pessimize all atomics. Also they do not support + // the nontemporal attribute. + assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); + + bool Changed = false; + + if (IsVolatile) { + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); + + // Ensure operation has completed at system scope to cause all volatile + // operations to be visible outside the program in a global order. Do not + // request cross address space as only the global address space can be + // observable outside the program, so no need to cause a waitcnt for LDS + // address space operations. 
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, + Position::AFTER); + } + + if (IsNonTemporal) { + // Set non-temporal hint for all cache levels. + Changed |= setTH(MI, AMDGPU::CPol::TH_NT); + } + + return Changed; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 46fa3d57a21cb2..53b4faeb7e0345 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1597,6 +1597,13 @@ def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16", // that doesn't access memory. def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">; def S_WAITCNT_VSCNT_soft : SOPK_WAITCNT<"s_soft_waitcnt_vscnt">; +let SubtargetPredicate = isGFX12Plus in { + def S_WAIT_LOADCNT_soft : SOPP_Pseudo <"s_soft_wait_loadcnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_STORECNT_soft : SOPP_Pseudo <"s_soft_wait_storecnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">; +} def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 26ba2575ff34ac..b4f7fc456f0bdd 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -94,6 +94,44 @@ unsigned getVmcntBitWidthHi(unsigned VersionMajor) { return (VersionMajor == 9 || VersionMajor == 10) ? 
2 : 0; } +/// \returns Loadcnt bit width +unsigned getLoadcntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 12 ? 6 : 0; +} + +/// \returns Samplecnt bit width. +unsigned getSamplecntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 12 ? 6 : 0; +} + +/// \returns Bvhcnt bit width. +unsigned getBvhcntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 12 ? 3 : 0; +} + +/// \returns Dscnt bit width. +unsigned getDscntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 12 ? 6 : 0; +} + +/// \returns Dscnt bit shift in combined S_WAIT instructions. +unsigned getDscntBitShift(unsigned VersionMajor) { return 0; } + +/// \returns Storecnt or Vscnt bit width, depending on VersionMajor. +unsigned getStorecntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 10 ? 6 : 0; +} + +/// \returns Kmcnt bit width. +unsigned getKmcntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 12 ? 5 : 0; +} + +/// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions. +unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) { + return VersionMajor >= 12 ? 
8 : 0; +} + /// \returns VmVsrc bit width inline unsigned getVmVsrcBitWidth() { return 3; } @@ -1229,6 +1267,18 @@ unsigned getVmcntBitMask(const IsaVersion &Version) { 1; } +unsigned getLoadcntBitMask(const IsaVersion &Version) { + return (1 << getLoadcntBitWidth(Version.Major)) - 1; +} + +unsigned getSamplecntBitMask(const IsaVersion &Version) { + return (1 << getSamplecntBitWidth(Version.Major)) - 1; +} + +unsigned getBvhcntBitMask(const IsaVersion &Version) { + return (1 << getBvhcntBitWidth(Version.Major)) - 1; +} + unsigned getExpcntBitMask(const IsaVersion &Version) { return (1 << getExpcntBitWidth(Version.Major)) - 1; } @@ -1237,6 +1287,18 @@ unsigned getLgkmcntBitMask(const IsaVersion &Version) { return (1 << getLgkmcntBitWidth(Version.Major)) - 1; } +unsigned getDscntBitMask(const IsaVersion &Version) { + return (1 << getDscntBitWidth(Version.Major)) - 1; +} + +unsigned getKmcntBitMask(const IsaVersion &Version) { + return (1 << getKmcntBitWidth(Version.Major)) - 1; +} + +unsigned getStorecntBitMask(const IsaVersion &Version) { + return (1 << getStorecntBitWidth(Version.Major)) - 1; +} + unsigned getWaitcntBitMask(const IsaVersion &Version) { unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major), getVmcntBitWidthLo(Version.Major)); @@ -1276,9 +1338,9 @@ void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) { Waitcnt Decoded; - Decoded.VmCnt = decodeVmcnt(Version, Encoded); + Decoded.LoadCnt = decodeVmcnt(Version, Encoded); Decoded.ExpCnt = decodeExpcnt(Version, Encoded); - Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded); + Decoded.DsCnt = decodeLgkmcnt(Version, Encoded); return Decoded; } @@ -1313,7 +1375,85 @@ unsigned encodeWaitcnt(const IsaVersion &Version, } unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) { - return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt); + return encodeWaitcnt(Version, Decoded.LoadCnt, 
Decoded.ExpCnt, Decoded.DsCnt); +} + +static unsigned getCombinedCountBitMask(const IsaVersion &Version, + bool IsStore) { + unsigned Dscnt = getBitMask(getDscntBitShift(Version.Major), + getDscntBitWidth(Version.Major)); + if (IsStore) { + unsigned Storecnt = getBitMask(getLoadcntStorecntBitShift(Version.Major), + getStorecntBitWidth(Version.Major)); + return Dscnt | Storecnt; + } else { + unsigned Loadcnt = getBitMask(getLoadcntStorecntBitShift(Version.Major), + getLoadcntBitWidth(Version.Major)); + return Dscnt | Loadcnt; + } +} + +Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt) { + Waitcnt Decoded; + Decoded.LoadCnt = + unpackBits(LoadcntDscnt, getLoadcntStorecntBitShift(Version.Major), + getLoadcntBitWidth(Version.Major)); + Decoded.DsCnt = unpackBits(LoadcntDscnt, getDscntBitShift(Version.Major), + getDscntBitWidth(Version.Major)); + return Decoded; +} + +Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt) { + Waitcnt Decoded; + Decoded.StoreCnt = + unpackBits(StorecntDscnt, getLoadcntStorecntBitShift(Version.Major), + getStorecntBitWidth(Version.Major)); + Decoded.DsCnt = unpackBits(StorecntDscnt, getDscntBitShift(Version.Major), + getDscntBitWidth(Version.Major)); + return Decoded; +} + +static unsigned encodeLoadcnt(const IsaVersion &Version, unsigned Waitcnt, + unsigned Loadcnt) { + return packBits(Loadcnt, Waitcnt, getLoadcntStorecntBitShift(Version.Major), + getLoadcntBitWidth(Version.Major)); +} + +static unsigned encodeStorecnt(const IsaVersion &Version, unsigned Waitcnt, + unsigned Storecnt) { + return packBits(Storecnt, Waitcnt, getLoadcntStorecntBitShift(Version.Major), + getStorecntBitWidth(Version.Major)); +} + +static unsigned encodeDscnt(const IsaVersion &Version, unsigned Waitcnt, + unsigned Dscnt) { + return packBits(Dscnt, Waitcnt, getDscntBitShift(Version.Major), + getDscntBitWidth(Version.Major)); +} + +static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, + 
unsigned Dscnt) { + unsigned Waitcnt = getCombinedCountBitMask(Version, false); + Waitcnt = encodeLoadcnt(Version, Waitcnt, Loadcnt); + Waitcnt = encodeDscnt(Version, Waitcnt, Dscnt); + return Waitcnt; +} + +unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded) { + return encodeLoadcntDscnt(Version, Decoded.LoadCnt, Decoded.DsCnt); +} + +static unsigned encodeStorecntDscnt(const IsaVersion &Version, + unsigned Storecnt, unsigned Dscnt) { + unsigned Waitcnt = getCombinedCountBitMask(Version, true); + Waitcnt = encodeStorecnt(Version, Waitcnt, Storecnt); + Waitcnt = encodeDscnt(Version, Waitcnt, Dscnt); + return Waitcnt; +} + +unsigned encodeStorecntDscnt(const IsaVersion &Version, + const Waitcnt &Decoded) { + return encodeStorecntDscnt(Version, Decoded.StoreCnt, Decoded.DsCnt); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 50c741760d7143..351563e957f14a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -837,39 +837,58 @@ getIntegerPairAttribute(const Function &F, StringRef Name, /// Large values (including the maximum possible integer) can be used to /// represent "don't care" waits. struct Waitcnt { - unsigned VmCnt = ~0u; + unsigned LoadCnt = ~0u; // Corresponds to Vmcnt prior to gfx12. unsigned ExpCnt = ~0u; - unsigned LgkmCnt = ~0u; - unsigned VsCnt = ~0u; + unsigned DsCnt = ~0u; // Corresponds to LGKMcnt prior to gfx12. + unsigned StoreCnt = ~0u; // Corresponds to VScnt on gfx10/gfx11. + unsigned SampleCnt = ~0u; // gfx12+ only. + unsigned BvhCnt = ~0u; // gfx12+ only. + unsigned KmCnt = ~0u; // gfx12+ only. Waitcnt() = default; + // Pre-gfx12 constructor. 
Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt) - : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {} - - static Waitcnt allZero(bool HasVscnt) { - return Waitcnt(0, 0, 0, HasVscnt ? 0 : ~0u); + : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt), + SampleCnt(~0u), BvhCnt(~0u), KmCnt(~0u) {} + + // gfx12+ constructor. + Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, + unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt) + : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt), + SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {} + + static Waitcnt allZero(bool Extended, bool HasStorecnt) { + return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0) + : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u); } - static Waitcnt allZeroExceptVsCnt() { return Waitcnt(0, 0, 0, ~0u); } - bool hasWait() const { - return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u; + static Waitcnt allZeroExceptVsCnt(bool Extended) { + return Extended ? Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u); } - bool hasWaitExceptVsCnt() const { - return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u; - } + bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); } - bool hasWaitVsCnt() const { - return VsCnt != ~0u; + bool hasWaitExceptStoreCnt() const { + return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u || + SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u; } + bool hasWaitStoreCnt() const { return StoreCnt != ~0u; } + Waitcnt combined(const Waitcnt &Other) const { - return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt), - std::min(LgkmCnt, Other.LgkmCnt), - std::min(VsCnt, Other.VsCnt)); + // Does the right thing provided self and Other are either both pre-gfx12 + // or both gfx12+. 
+ return Waitcnt( + std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt), + std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt), + std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt), + std::min(KmCnt, Other.KmCnt)); } }; +// The following methods are only meaningful on targets that support +// S_WAITCNT. + /// \returns Vmcnt bit mask for given isa \p Version. unsigned getVmcntBitMask(const IsaVersion &Version); @@ -893,17 +912,19 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt); /// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and -/// \p Lgkmcnt respectively. +/// \p Lgkmcnt respectively. Should not be used on gfx12+, the instruction +/// which needs it is deprecated /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows: /// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9) /// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10) -/// \p Vmcnt = \p Waitcnt[15:10] (gfx11+) +/// \p Vmcnt = \p Waitcnt[15:10] (gfx11) /// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11) -/// \p Expcnt = \p Waitcnt[2:0] (gfx11+) +/// \p Expcnt = \p Waitcnt[2:0] (gfx11) /// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10) /// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10) -/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11+) +/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11) +/// void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); @@ -922,26 +943,78 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Lgkmcnt); /// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa -/// \p Version. +/// \p Version. 
Should not be used on gfx12+, the instruction which needs +/// it is deprecated /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows: /// Waitcnt[2:0] = \p Expcnt (gfx11+) /// Waitcnt[3:0] = \p Vmcnt (pre-gfx9) /// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9,10) /// Waitcnt[6:4] = \p Expcnt (pre-gfx11) -/// Waitcnt[9:4] = \p Lgkmcnt (gfx11+) +/// Waitcnt[9:4] = \p Lgkmcnt (gfx11) /// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10) /// Waitcnt[13:8] = \p Lgkmcnt (gfx10) -/// Waitcnt[15:10] = \p Vmcnt (gfx11+) +/// Waitcnt[15:10] = \p Vmcnt (gfx11) /// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9,10) /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given /// isa \p Version. +/// unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt); unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded); +// The following methods are only meaningful on targets that support +// S_WAIT_*CNT, introduced with gfx12. + +/// \returns Loadcnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support LOADcnt +unsigned getLoadcntBitMask(const IsaVersion &Version); + +/// \returns Samplecnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support SAMPLEcnt +unsigned getSamplecntBitMask(const IsaVersion &Version); + +/// \returns Bvhcnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support BVHcnt +unsigned getBvhcntBitMask(const IsaVersion &Version); + +/// \returns Dscnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support DScnt +unsigned getDscntBitMask(const IsaVersion &Version); + +/// \returns Kmcnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support KMcnt +unsigned getKmcntBitMask(const IsaVersion &Version); + +/// \return STOREcnt or VScnt bit mask for given isa \p Version. +/// returns 0 for versions that do not support STOREcnt or VScnt. 
+/// STOREcnt and VScnt are the same counter, the name used +/// depends on the ISA version. +unsigned getStorecntBitMask(const IsaVersion &Version); + +// The following are only meaningful on targets that support +// S_WAIT_LOADCNT_DSCNT and S_WAIT_STORECNT_DSCNT. + +/// \returns Decoded Waitcnt structure from given \p LoadcntDscnt for given +/// isa \p Version. +Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt); + +/// \returns Decoded Waitcnt structure from given \p StorecntDscnt for given +/// isa \p Version. +Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt); + +/// \returns \p Loadcnt and \p Dscnt components of \p Decoded encoded as an +/// immediate that can be used with S_WAIT_LOADCNT_DSCNT for given isa +/// \p Version. +unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded); + +/// \returns \p Storecnt and \p Dscnt components of \p Decoded encoded as an +/// immediate that can be used with S_WAIT_STORECNT_DSCNT for given isa +/// \p Version. 
+unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded); + namespace Hwreg { LLVM_READONLY diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll index b850c37c4a2810..e3243ff186e3ee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -77,7 +77,7 @@ define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll index 62e5bce23664cd..c7676e9da6f499 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll @@ -11,7 +11,11 @@ define float @test_fmed3_f32_known_nnan_ieee_true(float %a) #0 { ; ; GFX12-LABEL: test_fmed3_f32_known_nnan_ieee_true: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 @@ -28,7 +32,11 @@ define half @test_fmed3_f16_known_nnan_ieee_false(half %a) #1 { ; ; GFX12-LABEL: test_fmed3_f16_known_nnan_ieee_false: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul half %a, 2.0 @@ -47,7 +55,11 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 { ; ; GFX12-LABEL: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e64 v0, 0x41200000, v0 clamp @@ -67,7 +79,11 @@ define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp ; ; GFX12-LABEL: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 @@ -86,7 +102,11 @@ define float @test_fmed3_global_nnan(float %a) #3 { ; ; GFX12-LABEL: test_fmed3_global_nnan: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 @@ -109,7 +129,11 @@ define float @test_fmed3_f32_maybe_NaN_ieee_false(float %a) #1 { ; ; GFX12-LABEL: test_fmed3_f32_maybe_NaN_ieee_false: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 @@ -129,7 +153,11 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false(float %a) #4 { ; ; GFX12-LABEL: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e64 v0, 0x41200000, v0 clamp @@ -149,7 +177,11 @@ define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 ; ; GFX12-LABEL: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll index bba3687dbbc2cc..ca0047bba6c4bd 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll @@ -11,7 +11,11 @@ define float @test_min_max_ValK0_K1_f32(float %a) #0 { ; ; GFX12-LABEL: test_min_max_ValK0_K1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 @@ -29,7 +33,11 @@ define double @test_min_max_K0Val_K1_f64(double %a) #1 { ; ; GFX12-LABEL: test_min_max_K0Val_K1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f64_e64 v[0:1], v[0:1], 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul double %a, 2.0 @@ -48,7 +56,11 @@ define half @test_min_K1max_ValK0_f16(half %a) #2 { ; ; GFX12-LABEL: test_min_K1max_ValK0_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul half %a, 2.0 @@ -66,7 +78,11 @@ define <2 x half> @test_min_K1max_K0Val_f16(<2 x half> %a) #1 { ; ; GFX12-LABEL: test_min_K1max_K0Val_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp 
; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul <2 x half> %a, @@ -84,7 +100,11 @@ define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 { ; ; GFX12-LABEL: test_min_max_splat_padded_with_undef: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul <2 x half> %a, @@ -104,7 +124,11 @@ define float @test_max_min_ValK1_K0_f32(float %a) #0 { ; ; GFX12-LABEL: test_max_min_ValK1_K0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 @@ -122,7 +146,11 @@ define double @test_max_min_K1Val_K0_f64(double %a) #1 { ; ; GFX12-LABEL: test_max_min_K1Val_K0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f64_e64 v[0:1], v[0:1], 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul double %a, 2.0 @@ -140,7 +168,11 @@ define half @test_max_K0min_ValK1_f16(half %a) #0 { ; ; GFX12-LABEL: test_max_K0min_ValK1_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = 
fmul half %a, 2.0 @@ -159,7 +191,11 @@ define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 { ; ; GFX12-LABEL: test_max_K0min_K1Val_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul <2 x half> %a, @@ -179,7 +215,11 @@ define float @test_min_max_global_nnan(float %a) #3 { ; ; GFX12-LABEL: test_min_max_global_nnan: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call float @llvm.maxnum.f32(float %a, float 0.0) @@ -196,7 +236,11 @@ define float @test_max_min_global_nnan(float %a) #3 { ; ; GFX12-LABEL: test_max_min_global_nnan: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 1.0) @@ -219,7 +263,11 @@ define float @test_min_max_K0_gt_K1(float %a) #0 { ; ; GFX12-LABEL: test_min_max_K0_gt_K1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_maxmin_num_f32 v0, v0, 1.0, 0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call nnan 
float @llvm.maxnum.f32(float %a, float 1.0) @@ -238,7 +286,11 @@ define float @test_max_min_K0_gt_K1(float %a) #0 { ; ; GFX12-LABEL: test_max_min_K0_gt_K1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0, 1.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call nnan float @llvm.minnum.f32(float %a, float 0.0) @@ -260,7 +312,11 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 { ; ; GFX12-LABEL: test_min_max_maybe_NaN_input_ieee_false: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 @@ -280,7 +336,11 @@ define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) # ; ; GFX12-LABEL: test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 @@ -302,7 +362,11 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 { ; ; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_true: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_mul_f32_e32 v0, 2.0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_minmax_num_f32 v0, v0, 1.0, 0 @@ -324,7 +388,11 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 { ; ; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_false: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_minmax_num_f32 v0, v0, 1.0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index e132f7a0ec757a..b58c3b20986363 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -20,7 +20,11 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; ; GFX12-LABEL: v_extract_v64i32_varidx: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v2, 63, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2 @@ -29,7 +33,7 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, ptr addrspace(1) %ptr %elt = extractelement <64 x i32> %vec, i32 %idx @@ -51,7 +55,11 @@ define i16 
@v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; ; GFX12-LABEL: v_extract_v128i16_varidx: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 1, v2 @@ -60,7 +68,7 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_load_u16 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %vec = load <128 x i16>, ptr addrspace(1) %ptr %elt = extractelement <128 x i16> %vec, i32 %idx @@ -82,7 +90,11 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; ; GFX12-LABEL: v_extract_v32i64_varidx: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v2, 31, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v2 @@ -91,7 +103,7 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %vec = load <32 x i64>, ptr addrspace(1) %ptr %elt = extractelement <32 x i64> %vec, i32 %idx diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 4603fbcd525c78..7a953c3ad80ad4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -79,17 +79,17 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_co_i32 s0, s0, 4 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -168,11 +168,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:4 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v1, 4, v1 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: %i = 
alloca [32 x float], align 4, addrspace(5) @@ -247,16 +247,20 @@ define void @store_load_vindex_foo(i32 %idx) { ; ; GFX12-LABEL: store_load_vindex_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_nc_u32_e32 v1, s32, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, s32 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v1, v2, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v0, s32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -301,7 +305,11 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { ; ; GFX12-LABEL: private_ptr_foo: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -391,18 +399,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_addk_co_i32 s0, 0x104 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:260 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -490,14 +499,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:260 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -588,18 +597,22 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; ; 
GFX12-LABEL: store_load_vindex_small_offset_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x100 -; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:256 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v1, v2, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:256 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -697,18 +710,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_addk_co_i32 s0, 0x4004 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 
th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16388 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16388 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -798,14 +812,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16388 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16388 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -898,18 +912,22 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; ; GFX12-LABEL: store_load_vindex_large_offset_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX12-NEXT: 
s_add_co_i32 s0, s32, 0x4000 -; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:16384 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v1, v2, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:16384 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -988,12 +1006,12 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX12-LABEL: store_load_large_imm_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -1068,14 +1086,18 @@ define void @store_load_large_imm_offset_foo() { ; ; GFX12-LABEL: store_load_large_imm_offset_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX12-NEXT: scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -1152,14 +1174,14 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) @@ -1218,13 +1240,17 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-LABEL: store_load_i64_aligned: ; 
GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, ptr addrspace(5) %arg, align 8 @@ -1278,13 +1304,17 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-LABEL: store_load_i64_unaligned: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, ptr addrspace(5) %arg, align 1 @@ -1354,16 +1384,20 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-LABEL: store_load_v3i32_unaligned: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s2, 3 ; GFX12-NEXT: s_mov_b32 s1, 2 ; GFX12-NEXT: s_mov_b32 s0, 1 ; GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> , ptr addrspace(5) %arg, align 1 @@ -1438,17 +1472,21 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-LABEL: store_load_v4i32_unaligned: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s3, 4 ; GFX12-NEXT: s_mov_b32 s2, 3 ; GFX12-NEXT: s_mov_b32 s1, 2 ; GFX12-NEXT: s_mov_b32 s0, 1 ; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> , ptr 
addrspace(5) %arg, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll index 096ca5bc8705f5..75c4cd53e3bfc7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll @@ -18,7 +18,11 @@ define float @test_min_max_ValK0_K1_f32(float %a) #0 { ; ; GFX12-LABEL: test_min_max_ValK0_K1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0) @@ -41,7 +45,11 @@ define float @test_min_max_K0Val_K1_f32(float %a) #1 { ; ; GFX12-LABEL: test_min_max_K0Val_K1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call nnan float @llvm.maxnum.f32(float 2.0, float %a) @@ -69,7 +77,11 @@ define half @test_min_K1max_ValK0_f16(half %a) #0 { ; ; GFX12-LABEL: test_min_K1max_ValK0_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0 @@ -95,7 +107,11 @@ define half @test_min_K1max_K0Val_f16(half %a) #1 { ; ; GFX12-LABEL: test_min_K1max_K0Val_f16: ; 
GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call nnan half @llvm.maxnum.f16(half 2.0, half %a) @@ -119,7 +135,11 @@ define float @test_max_min_ValK1_K0_f32(float %a) #0 { ; ; GFX12-LABEL: test_max_min_ValK1_K0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call nnan float @llvm.minnum.f32(float %a, float 4.0) @@ -142,7 +162,11 @@ define float @test_max_min_K1Val_K0_f32(float %a) #1 { ; ; GFX12-LABEL: test_max_min_K1Val_K0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call nnan float @llvm.minnum.f32(float 4.0, float %a) @@ -166,7 +190,11 @@ define half @test_max_K0min_ValK1_f16(half %a) #0 { ; ; GFX12-LABEL: test_max_K0min_ValK1_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call nnan half @llvm.minnum.f16(half %a, half 4.0) @@ -190,7 +218,11 @@ define half @test_max_K0min_K1Val_f16(half %a) #1 { ; ; 
GFX12-LABEL: test_max_K0min_K1Val_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call nnan half @llvm.minnum.f16(half 4.0, half %a) @@ -215,7 +247,11 @@ define float @test_min_max_global_nnan(float %a) #2 { ; ; GFX12-LABEL: test_min_max_global_nnan: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0) @@ -238,7 +274,11 @@ define float @test_max_min_global_nnan(float %a) #2 { ; ; GFX12-LABEL: test_max_min_global_nnan: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 4.0) @@ -268,7 +308,11 @@ define float @test_min_max_K0_gt_K1(float %a) #0 { ; ; GFX12-LABEL: test_min_max_K0_gt_K1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_maxmin_num_f32 v0, v0, 4.0, 2.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 4.0) @@ -294,7 +338,11 @@ define float 
@test_max_min_K0_gt_K1(float %a) #0 { ; ; GFX12-LABEL: test_max_min_K0_gt_K1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_minmax_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call nnan float @llvm.minnum.f32(float %a, float 2.0) @@ -320,7 +368,11 @@ define float @test_min_max_non_inline_const(float %a) #0 { ; ; GFX12-LABEL: test_min_max_non_inline_const: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_maxmin_num_f32 v0, v0, 2.0, 0x41000000 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0) @@ -347,7 +399,11 @@ define double @test_min_max_f64(double %a) #0 { ; ; GFX12-LABEL: test_min_max_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 2.0, v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], 4.0, v[0:1] @@ -379,7 +435,11 @@ define <2 x half> @test_min_max_v2f16(<2 x half> %a) #0 { ; ; GFX12-LABEL: test_min_max_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v0, 2.0 op_sel_hi:[1,0] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: 
v_pk_min_num_f16 v0, v0, 4.0 op_sel_hi:[1,0] @@ -409,7 +469,11 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 { ; ; GFX12-LABEL: test_min_max_maybe_NaN_input_ieee_false: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 @@ -438,7 +502,11 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 { ; ; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_false: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0 @@ -468,7 +536,11 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 { ; ; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_true: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index d76c9e4c1c1346..ade6e55b482bb7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -21,9 
+21,13 @@ define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) { ; ; GFX12-LABEL: global_atomic_csub: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data) ret i32 %ret @@ -50,9 +54,13 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) { ; ; GFX12-LABEL: global_atomic_csub_offset: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data) @@ -76,9 +84,13 @@ define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) { ; ; GFX12-LABEL: global_atomic_csub_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr 
addrspace(1) %ptr, i32 %data) ret void @@ -105,9 +117,13 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) { ; ; GFX12-LABEL: global_atomic_csub_offset_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data) @@ -145,10 +161,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -184,7 +200,7 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll index b2eb24272e1e51..a3796197e4c534 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -44,7 +44,7 @@ define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -92,7 +92,7 @@ define amdgpu_ps float @atomic_add_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -140,7 +140,7 @@ define amdgpu_ps float @atomic_sub_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -188,7 +188,7 @@ define amdgpu_ps float @atomic_smin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D 
th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -236,7 +236,7 @@ define amdgpu_ps float @atomic_umin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -284,7 +284,7 @@ define amdgpu_ps float @atomic_smax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -332,7 +332,7 @@ define amdgpu_ps float @atomic_umax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -380,7 +380,7 @@ define amdgpu_ps float @atomic_and_i321d(<8 x i32> inreg %rsrc, i32 %data, i16 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D 
th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -428,7 +428,7 @@ define amdgpu_ps float @atomic_or_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -476,7 +476,7 @@ define amdgpu_ps float @atomic_xor_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -524,7 +524,7 @@ define amdgpu_ps float @atomic_inc_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -572,7 +572,7 @@ define amdgpu_ps float @atomic_dec_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; 
GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -620,7 +620,7 @@ define amdgpu_ps float @atomic_cmpswap_i32_1d(<8 x i32> inreg %rsrc, i32 %cmp, i ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i16(i32 %cmp, i32 %swap, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -674,7 +674,7 @@ define amdgpu_ps float @atomic_add_i32_2d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 %data, i16 %s, i16 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -728,7 +728,7 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i16(i32 %data, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -782,7 +782,7 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i1 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 
dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i16(i32 %data, i16 %s, i16 %t, i16 %face, <8 x i32> %rsrc, i32 0, i32 0) @@ -836,7 +836,7 @@ define amdgpu_ps float @atomic_add_i32_1darray(<8 x i32> inreg %rsrc, i32 %data, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i16(i32 %data, i16 %s, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -890,7 +890,7 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i16(i32 %data, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -944,7 +944,7 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i16(i32 %data, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -1004,7 +1004,7 @@ define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %d ; GFX12-NEXT: 
s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i16(i32 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -1052,7 +1052,7 @@ define amdgpu_ps float @atomic_add_i32_1d_slc(<8 x i32> inreg %rsrc, i32 %data, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -1100,7 +1100,7 @@ define amdgpu_ps <2 x float> @atomic_swap_i64_1d(<8 x i32> inreg %rsrc, i64 %dat ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1148,7 +1148,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1196,7 +1196,7 @@ define amdgpu_ps <2 x float> 
@atomic_sub_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_sub_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.sub.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1244,7 +1244,7 @@ define amdgpu_ps <2 x float> @atomic_smin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_min_int v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.smin.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1292,7 +1292,7 @@ define amdgpu_ps <2 x float> @atomic_umin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_min_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.umin.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1340,7 +1340,7 @@ define amdgpu_ps <2 x float> @atomic_smax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_max_int v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.smax.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1388,7 +1388,7 @@ define amdgpu_ps <2 
x float> @atomic_umax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_max_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.umax.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1436,7 +1436,7 @@ define amdgpu_ps <2 x float> @atomic_and_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_and v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.and.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1484,7 +1484,7 @@ define amdgpu_ps <2 x float> @atomic_or_i64_1d(<8 x i32> inreg %rsrc, i64 %data, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_or v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.or.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1532,7 +1532,7 @@ define amdgpu_ps <2 x float> @atomic_xor_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_xor v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.xor.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1580,7 +1580,7 @@ define amdgpu_ps <2 x 
float> @atomic_inc_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_inc_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.inc.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1628,7 +1628,7 @@ define amdgpu_ps <2 x float> @atomic_dec_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_dec_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.dec.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1676,7 +1676,7 @@ define amdgpu_ps <2 x float> @atomic_cmpswap_i64_1d(<8 x i32> inreg %rsrc, i64 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i16(i64 %cmp, i64 %swap, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1730,7 +1730,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i16(i64 %data, i16 %s, i16 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -1784,7 
+1784,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.3d.i64.i16(i64 %data, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -1838,7 +1838,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.cube.i64.i16(i64 %data, i16 %s, i16 %t, i16 %face , <8 x i32> %rsrc, i32 0, i32 0) @@ -1892,7 +1892,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1darray(<8 x i32> inreg %rsrc, i64 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1darray.i64.i16(i64 %data, i16 %s, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -1946,7 +1946,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 
@llvm.amdgcn.image.atomic.add.2darray.i64.i16(i64 %data, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -2000,7 +2000,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2dmsaa.i64.i16(i64 %data, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -2060,7 +2060,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY th:TH_ATOMIC_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2darraymsaa.i64.i16(i64 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -2108,7 +2108,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d_slc(<8 x i32> inreg %rsrc, i64 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll index 3c82cff00bda3c..221e2fd4f00f73 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll @@ -90,7 +90,7 @@ define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -181,7 +181,7 @@ define amdgpu_ps float @atomic_add_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -272,7 +272,7 @@ define amdgpu_ps float @atomic_sub_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -363,7 +363,7 @@ define amdgpu_ps float @atomic_smin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, 
<8 x i32> %rsrc, i32 0, i32 0) @@ -454,7 +454,7 @@ define amdgpu_ps float @atomic_umin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -545,7 +545,7 @@ define amdgpu_ps float @atomic_smax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -636,7 +636,7 @@ define amdgpu_ps float @atomic_umax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -727,7 +727,7 @@ define amdgpu_ps float @atomic_and_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ 
-818,7 +818,7 @@ define amdgpu_ps float @atomic_or_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -909,7 +909,7 @@ define amdgpu_ps float @atomic_xor_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1000,7 +1000,7 @@ define amdgpu_ps float @atomic_inc_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1091,7 +1091,7 @@ define amdgpu_ps float @atomic_dec_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1181,7 +1181,7 @@ define amdgpu_ps float 
@atomic_cmpswap_i32_1d(<8 x i32> inreg %rsrc, i32 %cmp, i ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1356,7 +1356,7 @@ define amdgpu_ps float @atomic_add_i32_2d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -1449,7 +1449,7 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -1542,7 +1542,7 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i3 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> 
%rsrc, i32 0, i32 0) @@ -1634,7 +1634,7 @@ define amdgpu_ps float @atomic_add_i32_1darray(<8 x i32> inreg %rsrc, i32 %data, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -1727,7 +1727,7 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -1820,7 +1820,7 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -1914,7 +1914,7 @@ define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %d ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to 
shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -2005,7 +2005,7 @@ define amdgpu_ps float @atomic_add_i32_1d_slc(<8 x i32> inreg %rsrc, i32 %data, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -2095,7 +2095,7 @@ define amdgpu_ps <2 x float> @atomic_swap_i64_1d(<8 x i32> inreg %rsrc, i64 %dat ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2185,7 +2185,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2275,7 +2275,7 @@ define amdgpu_ps <2 x float> @atomic_sub_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_sub_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.sub.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2365,7 +2365,7 @@ define amdgpu_ps <2 x float> @atomic_smin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_min_int v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.smin.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2455,7 +2455,7 @@ define amdgpu_ps <2 x float> @atomic_umin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_min_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.umin.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2545,7 +2545,7 @@ define amdgpu_ps <2 x float> @atomic_smax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_max_int v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.smax.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2635,7 +2635,7 @@ define amdgpu_ps <2 x float> @atomic_umax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_max_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.umax.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2725,7 +2725,7 @@ define amdgpu_ps <2 x float> @atomic_and_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_and v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.and.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2815,7 +2815,7 @@ define amdgpu_ps <2 x float> @atomic_or_i64_1d(<8 x i32> inreg %rsrc, i64 %data, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_or v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.or.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2905,7 +2905,7 @@ define amdgpu_ps <2 x float> @atomic_xor_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_xor v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.xor.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2995,7 +2995,7 @@ define amdgpu_ps <2 x float> @atomic_inc_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_inc_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to 
shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.inc.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3085,7 +3085,7 @@ define amdgpu_ps <2 x float> @atomic_dec_i64_1d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_dec_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.dec.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3175,7 +3175,7 @@ define amdgpu_ps <2 x float> @atomic_cmpswap_i64_1d(<8 x i32> inreg %rsrc, i64 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3348,7 +3348,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i32(i64 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -3438,7 +3438,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v3, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.3d.i64.i32(i64 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -3528,7 +3528,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v3, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.cube.i64.i32(i64 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) @@ -3618,7 +3618,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1darray(<8 x i32> inreg %rsrc, i64 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1darray.i64.i32(i64 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -3708,7 +3708,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v3, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2darray.i64.i32(i64 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -3798,7 +3798,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v3, v4], 
s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2dmsaa.i64.i32(i64 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -3888,7 +3888,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], [v2, v3, v4, v5], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2darraymsaa.i64.i32(i64 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -3978,7 +3978,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d_slc(<8 x i32> inreg %rsrc, i64 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll index b3b143372481e6..916b9c0835d41c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -71,7 +71,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 
dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -145,7 +145,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32 1, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -219,7 +219,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32 1, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -293,7 +293,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, 
i1 false, i32 0, i32 0) @@ -367,7 +367,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32 1, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -443,7 +443,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -517,7 +517,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -591,7 +591,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4_c_b 
v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -667,7 +667,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -743,7 +743,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -808,7 +808,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = 
call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 1, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -875,7 +875,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -940,7 +940,7 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -1005,7 +1005,7 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll index 
e16c4a66e69914..841f4f1ac055ee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll @@ -65,7 +65,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -154,7 +154,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_gather4 v[0:4], [v5, v6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) @@ -223,7 +223,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 1, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -291,7 +291,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, 
s13 ; GFX12-NEXT: image_gather4 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 1, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -359,7 +359,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_c v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -427,7 +427,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 1, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -495,7 +495,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, 
float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -563,7 +563,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -631,7 +631,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_c_b v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -699,7 +699,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -767,7 +767,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: 
image_gather4_c_b_cl v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -826,7 +826,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_l v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -885,7 +885,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_c_l v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -944,7 +944,7 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -1003,7 +1003,7 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4_c_lz v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -1071,7 +1071,7 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32 ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 2, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -1139,7 +1139,7 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32 ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 4, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -1207,7 +1207,7 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32 ; GFX12-NEXT: s_mov_b32 s10, s12 ; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x8 
dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 8, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll index 0b6d66b02afd9e..d1a36d552e21b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll @@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i16 %mip) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -91,7 +91,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i16 %mip) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -138,7 +138,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i16 %mip) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x 
float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -185,7 +185,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i16 %mip) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -232,7 +232,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i16 %mip ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -279,7 +279,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i16 %mip ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -326,7 +326,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i16 %mip) ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -373,7 +373,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i16 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll index 23be1ca05d948a..496f9f428580b6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll @@ -59,7 +59,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -120,7 +120,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -181,7 +181,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) { ; 
GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -242,7 +242,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -303,7 +303,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -364,7 +364,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -425,7 +425,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip) ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: 
s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -486,7 +486,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -547,7 +547,7 @@ define amdgpu_ps <3 x float> @getresinfo_dmask7(<8 x i32> inreg %rsrc, <4 x floa ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <3 x float> @llvm.amdgcn.image.getresinfo.1d.v3f32.i32(i32 7, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -608,7 +608,7 @@ define amdgpu_ps <2 x float> @getresinfo_dmask3(<8 x i32> inreg %rsrc, <4 x floa ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <2 x float> @llvm.amdgcn.image.getresinfo.1d.v2f32.i32(i32 3, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -669,7 +669,7 @@ define amdgpu_ps float @getresinfo_dmask1(<8 x i32> inreg %rsrc, <4 x float> %vd ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_get_resinfo 
v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll index d3434e922752d9..19b0057d69b69d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -74,7 +74,7 @@ define amdgpu_ps half @load_1d_f16_x(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v @@ -148,7 +148,7 @@ define amdgpu_ps half @load_1d_f16_y(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v @@ -222,7 +222,7 @@ define amdgpu_ps half @load_1d_f16_z(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 4, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v @@ -296,7 +296,7 @@ define amdgpu_ps 
half @load_1d_f16_w(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v @@ -373,7 +373,7 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xy(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v @@ -450,7 +450,7 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x5 dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 5, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v @@ -527,7 +527,7 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xw(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v @@ -604,7 +604,7 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_yz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: 
image_load v0, v0, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v @@ -683,7 +683,7 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %v @@ -763,7 +763,7 @@ define amdgpu_ps <4 x half> @load_1d_v4f16_xyzw(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:1], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x half> %v @@ -851,7 +851,7 @@ define amdgpu_ps float @load_1d_f16_tfe_dmask_x(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: ; return to shader part epilog @@ -944,7 +944,7 @@ define amdgpu_ps float @load_1d_v2f16_tfe_dmask_xy(<8 x i32> inreg %rsrc, i32 %s ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: ; return to shader part epilog @@ -1041,7 +1041,7 @@ define amdgpu_ps float @load_1d_v3f16_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 % ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D tfe d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: ; return to shader part epilog @@ -1133,7 +1133,7 @@ define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D tfe d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll index 1cdcfba3d5fc3a..ecf81f633761d7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll @@ -60,7 +60,7 @@ define amdgpu_ps float @load_1d_f32_x(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret float %v @@ -120,7 +120,7 @@ define amdgpu_ps float @load_1d_f32_y(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, 
v0, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret float %v @@ -180,7 +180,7 @@ define amdgpu_ps float @load_1d_f32_z(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 4, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret float %v @@ -240,7 +240,7 @@ define amdgpu_ps float @load_1d_f32_w(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret float %v @@ -300,7 +300,7 @@ define amdgpu_ps <2 x float> @load_1d_v2f32_xy(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %v @@ -360,7 +360,7 @@ define amdgpu_ps <2 x float> @load_1d_v2f32_xz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x5 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <2 x float> 
@llvm.amdgcn.image.load.1d.v2f32.i32(i32 5, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %v @@ -420,7 +420,7 @@ define amdgpu_ps <2 x float> @load_1d_v2f32_xw(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %v @@ -480,7 +480,7 @@ define amdgpu_ps <2 x float> @load_1d_v2f32_yz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %v @@ -540,7 +540,7 @@ define amdgpu_ps <3 x float> @load_1d_v3f32_xyz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <3 x float> @llvm.amdgcn.image.load.1d.v3f32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x float> %v @@ -600,7 +600,7 @@ define amdgpu_ps <4 x float> @load_1d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s) ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ 
-670,7 +670,7 @@ define amdgpu_ps float @load_1d_f32_tfe_dmask_x(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: ; return to shader part epilog @@ -746,7 +746,7 @@ define amdgpu_ps float @load_1d_v2f32_tfe_dmask_xy(<8 x i32> inreg %rsrc, i32 %s ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: ; return to shader part epilog @@ -825,7 +825,7 @@ define amdgpu_ps float @load_1d_v3f32_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 % ; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: v_mov_b32_e32 v4, v1 ; GFX12-NEXT: image_load v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: ; return to shader part epilog @@ -899,7 +899,7 @@ define amdgpu_ps float @load_1d_v4f32_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: ; return to shader part epilog @@ -973,7 +973,7 @@ define amdgpu_ps float @load_1d_f32_tfe_dmask_0(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: image_load 
v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll index e1f571d55be23b..fb4c92353cb99d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll @@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -149,7 +149,7 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: v_mov_b32_e32 v4, v11 ; GFX12-NEXT: image_load v[0:4], [v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[10:11] ; GFX12-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) @@ -258,7 +258,7 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: v_mov_b32_e32 v4, v11 ; GFX12-NEXT: image_load v[0:4], [v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
global_store_b32 v7, v4, s[10:11] ; GFX12-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll index 4eb8c56382f8c8..2c8b8126aa09a4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -56,7 +56,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -176,7 +176,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX12-NEXT: v_mov_b32_e32 v3, v8 ; GFX12-NEXT: v_mov_b32_e32 v4, v9 ; GFX12-NEXT: image_load v[0:4], [v10, v11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v5, v4, s[10:11] ; GFX12-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -300,7 +300,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX12-NEXT: v_mov_b32_e32 v3, v8 ; GFX12-NEXT: v_mov_b32_e32 v4, v9 ; GFX12-NEXT: image_load v[0:4], [v10, v11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 tfe -; 
GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v5, v4, s[10:11] ; GFX12-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll index c7c216767b93bc..11ad98ac5fd1a4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -155,7 +155,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; GFX12-NEXT: v_mov_b32_e32 v4, v13 ; GFX12-NEXT: image_load v[0:4], [v5, v6, v7, v8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v9, v4, s[10:11] ; GFX12-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -270,7 +270,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: 
v_dual_mov_b32 v3, v12 ; GFX12-NEXT: v_mov_b32_e32 v4, v13 ; GFX12-NEXT: image_load v[0:4], [v5, v6, v7, v8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v9, v4, s[10:11] ; GFX12-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll index ff93d1119df83e..f5d11fcdff80a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -50,7 +50,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:3], [v0, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -158,7 +158,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: v_mov_b32_e32 v4, v10 ; GFX12-NEXT: image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v6, v4, s[10:11] ; GFX12-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0) @@ -270,7 +270,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x 
i32> inreg %rsrc, ; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: v_mov_b32_e32 v4, v10 ; GFX12-NEXT: image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v6, v4, s[10:11] ; GFX12-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll index 1285bb79e017f0..162a58613065f0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -151,7 +151,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; GFX12-NEXT: v_mov_b32_e32 v4, v12 ; GFX12-NEXT: image_load v[0:4], [v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v8, v4, s[10:11] ; GFX12-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0) @@ -262,7 +262,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x 
i32> inreg %rsrc, ; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; GFX12-NEXT: v_mov_b32_e32 v4, v12 ; GFX12-NEXT: image_load v[0:4], [v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v8, v4, s[10:11] ; GFX12-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll index 2b82d891afaf03..268d416d2020b8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -19,7 +19,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-LABEL: sample_d_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -54,7 +54,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX12-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, 
i32 0) @@ -91,7 +91,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX12-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v1, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -114,7 +114,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-LABEL: sample_c_d_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -149,7 +149,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX12-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -172,7 +172,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-LABEL: 
sample_d_cl_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -207,7 +207,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX12-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -230,7 +230,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-LABEL: sample_c_d_cl_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -267,7 +267,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX12-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[5:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D 
-; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -306,7 +306,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX12-NEXT: v_lshl_or_b32 v5, v5, 16, v4 ; GFX12-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -345,7 +345,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX12-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX12-NEXT: v_lshl_or_b32 v5, v5, 16, v4 ; GFX12-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll index f495ce5390c727..0d25ee527ee80a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll @@ -20,7 +20,11 @@ define float @v_rsq_clamp_f32(float %src) #0 { ; ; GFX12-LABEL: v_rsq_clamp_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_rsq_f32_e32 v0, v0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) @@ -47,7 +51,11 @@ define float @v_rsq_clamp_fabs_f32(float %src) #0 { ; ; GFX12-LABEL: v_rsq_clamp_fabs_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_rsq_f32_e64 v0, |v0| ; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) @@ -78,7 +86,11 @@ define double @v_rsq_clamp_f64(double %src) #0 { ; ; GFX12-LABEL: v_rsq_clamp_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s0, -1 ; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff @@ -112,7 +124,11 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 { ; ; GFX12-LABEL: v_rsq_clamp_fabs_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]| ; GFX12-NEXT: s_mov_b32 s0, -1 ; GFX12-NEXT: s_mov_b32 s1, 
0x7fefffff @@ -144,7 +160,11 @@ define float @v_rsq_clamp_undef_f32() #0 { ; ; GFX12-LABEL: v_rsq_clamp_undef_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_s_rsq_f32 s0, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0xff7fffff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) @@ -174,7 +194,11 @@ define double @v_rsq_clamp_undef_f64() #0 { ; ; GFX12-LABEL: v_rsq_clamp_undef_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_rsq_f64_e32 v[0:1], s[0:1] ; GFX12-NEXT: s_mov_b32 s0, -1 ; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff @@ -205,7 +229,11 @@ define float @v_rsq_clamp_f32_non_ieee(float %src) #2 { ; ; GFX12-LABEL: v_rsq_clamp_f32_non_ieee: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_rsq_f32_e32 v0, v0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) @@ -235,7 +263,11 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 { ; ; GFX12-LABEL: v_rsq_clamp_f64_non_ieee: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s0, -1 ; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index 4853bb309c1bb6..13fb5b7b9bfd3e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -11,14 +11,22 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX12-UNALIGNED: ; %bb.0: -; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 ; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0 ; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX12-NOUNALIGNED: ; %bb.0: -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 ; GFX12-NOUNALIGNED-NEXT: s_clause 0xb ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v2, v[0:1], off ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v3, v[0:1], off offset:1 @@ -32,23 +40,23 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9 ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11 ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; 
GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x9 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v5 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x5 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1 ; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4 @@ -217,14 +225,22 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) { ; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX12-UNALIGNED: ; %bb.0: -; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 ; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0 ; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-NOUNALIGNED-LABEL: 
v_load_constant_v3i32_align2: ; GFX12-NOUNALIGNED: ; %bb.0: -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 ; GFX12-NOUNALIGNED-NEXT: s_clause 0x5 ; GFX12-NOUNALIGNED-NEXT: global_load_u16 v2, v[0:1], off ; GFX12-NOUNALIGNED-NEXT: global_load_u16 v3, v[0:1], off offset:2 @@ -232,11 +248,11 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) { ; GFX12-NOUNALIGNED-NEXT: global_load_u16 v5, v[0:1], off offset:6 ; GFX12-NOUNALIGNED-NEXT: global_load_u16 v6, v[0:1], off offset:8 ; GFX12-NOUNALIGNED-NEXT: global_load_u16 v7, v[0:1], off offset:10 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6 ; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -332,9 +348,13 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) { define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) { ; GFX12-LABEL: v_load_constant_v3i32_align4: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_load_constant_v3i32_align4: @@ -374,9 +394,13 @@ define <3 x 
i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) { define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) { ; GFX12-LABEL: v_load_constant_i96_align8: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_load_constant_i96_align8: @@ -416,9 +440,13 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) { define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) { ; GFX12-LABEL: v_load_constant_v3i32_align8: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_load_constant_v3i32_align8: @@ -458,9 +486,13 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) { define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) { ; GFX12-LABEL: v_load_constant_v6i16_align8: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_load_constant_v6i16_align8: @@ -509,9 +541,13 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) 
%ptr) { define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) { ; GFX12-LABEL: v_load_constant_v12i8_align8: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v13, 8, v0 ; GFX12-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -598,9 +634,13 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) { define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) { ; GFX12-LABEL: v_load_constant_v3i32_align16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_load_constant_v3i32_align16: @@ -638,7 +678,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; GFX12-UNALIGNED: ; %bb.0: ; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1] -; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0 ; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 @@ -660,24 +700,24 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v10, v0, s[0:1] offset:9 ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v11, v0, s[0:1] offset:11 ; 
GFX12-NOUNALIGNED-NEXT: global_load_u8 v0, v0, s[0:1] offset:10 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v1 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x9 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v4 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x5 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v8 ; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v3, v1 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v10, 8, v9 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v11 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v5, v6, v4 ; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1 @@ -860,7 +900,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg ; GFX12-UNALIGNED: ; %bb.0: ; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1] -; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0 ; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX12-UNALIGNED-NEXT: 
v_readfirstlane_b32 s2, v2 @@ -876,11 +916,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg ; GFX12-NOUNALIGNED-NEXT: global_load_u16 v4, v0, s[0:1] offset:6 ; GFX12-NOUNALIGNED-NEXT: global_load_u16 v5, v0, s[0:1] offset:8 ; GFX12-NOUNALIGNED-NEXT: global_load_u16 v0, v0, s[0:1] offset:10 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v4, 16, v3 -; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v5 ; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -990,7 +1030,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg ; GFX12-LABEL: s_load_constant_v3i32_align4: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_v3i32_align4: @@ -1027,7 +1067,7 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) { ; GFX12-LABEL: s_load_constant_i96_align8: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_i96_align8: @@ -1064,7 +1104,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg ; GFX12-LABEL: s_load_constant_v3i32_align8: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: 
s_load_constant_v3i32_align8: @@ -1101,7 +1141,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg ; GFX12-LABEL: s_load_constant_v6i16_align8: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_v6i16_align8: @@ -1139,7 +1179,7 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg ; GFX12-LABEL: s_load_constant_v12i8_align8: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s13, s0, 8 ; GFX12-NEXT: s_lshr_b32 s12, s0, 16 ; GFX12-NEXT: s_lshr_b32 s3, s0, 24 @@ -1216,7 +1256,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg ; GFX12-LABEL: s_load_constant_v3i32_align16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog ; ; GCN-LABEL: s_load_constant_v3i32_align16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index 904120e7d11894..a9f0e546eb35b6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -671,8 +671,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr(ptr addrspace(1) inreg %ptr) { ; GFX12-LABEL: mubuf_load_sgpr_ptr: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %val = load volatile float, ptr addrspace(1) %ptr ret float %val @@ -704,8 +704,8 @@ define amdgpu_ps float 
@mubuf_load_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p ; GFX12-LABEL: mubuf_load_sgpr_ptr_offset4095: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:16380 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:16380 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4095 %val = load volatile float, ptr addrspace(1) %gep @@ -747,8 +747,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(ptr addrspace(1) in ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967296 %val = load volatile float, ptr addrspace(1) %gep @@ -790,8 +790,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(ptr addrspace(1) in ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967297 %val = load volatile float, ptr addrspace(1) %gep @@ -824,8 +824,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4096(ptr addrspace(1) inreg %p ; GFX12-LABEL: mubuf_load_sgpr_ptr_offset4096: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] 
offset:16384 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:16384 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4096 %val = load volatile float, ptr addrspace(1) %gep @@ -855,8 +855,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4095(ptr addrspace(1) %ptr) { ; ; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4095: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4095 %val = load volatile float, ptr addrspace(1) %gep @@ -893,8 +893,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(ptr addrspace(1) %p ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967296 %val = load volatile float, ptr addrspace(1) %gep @@ -931,8 +931,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to 
shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967297 %val = load volatile float, ptr addrspace(1) %gep @@ -962,8 +962,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(ptr addrspace(1) %ptr) { ; ; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4096: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16384 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16384 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4096 %val = load volatile float, ptr addrspace(1) %gep @@ -1007,8 +1007,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg % ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_co_u32 s0, s2, s0 ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, s1 -; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset %val = load volatile float, ptr addrspace(1) %gep @@ -1045,8 +1045,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(ptr addrspace(1) %ptr, i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset %val = load volatile float, ptr addrspace(1) %gep @@ -1083,8 +1083,8 @@ define amdgpu_ps float 
@mubuf_load_vgpr_ptr_sgpr_offset_offset256(ptr addrspace( ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 256 @@ -1122,8 +1122,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(ptr addrspace( ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 256 %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %soffset @@ -1165,8 +1165,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(ptr addrspace(1) inreg % ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset %val = load volatile float, ptr addrspace(1) %gep @@ -1209,8 +1209,8 @@ define amdgpu_ps float 
@mubuf_load_sgpr_ptr_vgpr_offset_offset4095(ptr addrspace ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset %gep1 = getelementptr float, ptr addrspace(1) %gep0, i64 4095 @@ -1253,8 +1253,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(ptr addrspace ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 4095 %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %voffset @@ -1294,7 +1294,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr ; GFX12: ; %bb.0: ; GFX12-NEXT: v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 @@ -1345,7 +1345,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, 2 ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off 
th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 @@ -1386,7 +1386,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 2 ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 @@ -1434,7 +1434,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 2 ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 @@ -1486,7 +1486,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset @@ -1530,7 +1530,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; 
GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 @@ -1583,7 +1583,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 @@ -1625,7 +1625,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr, ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v4, v2 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 @@ -1673,7 +1673,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 @@ -1726,7 +1726,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 0840f58ecd1a61..12d00f8cfd9c96 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -73,7 +73,11 @@ define i16 @v_mul_i16(i16 %num, i16 %den) { ; ; GFX12-LABEL: v_mul_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den @@ -152,7 +156,11 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { ; ; GFX12-LABEL: v_mul_i16_zeroext: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -237,7 +245,11 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) { ; ; GFX12-LABEL: v_mul_i16_signext: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -280,7 +292,11 @@ define i32 @v_mul_i32(i32 %num, i32 %den) { ; ; GFX12-LABEL: v_mul_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i32 %num, %den @@ -326,7 +342,11 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) { ; ; GFX12-LABEL: v_mul_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -480,7 +500,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; ; GFX12-LABEL: v_mul_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2 ; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2 @@ -653,7 +677,11 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; ; GFX12-LABEL: v_mul_i96: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 ; GFX12-NEXT: v_mul_lo_u32 v2, v2, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -989,7 +1017,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; ; GFX12-LABEL: v_mul_i128: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 
0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 ; GFX12-NEXT: v_mov_b32_e32 v10, v2 ; GFX12-NEXT: v_mul_lo_u32 v3, v3, v4 @@ -2352,7 +2384,11 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; ; GFX12-LABEL: v_mul_i256: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 ; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 ; GFX12-NEXT: v_mul_lo_u32 v7, v7, v8 @@ -2496,7 +2532,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX12-LABEL: s_mul_u64_zext_with_vregs: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v2, v[2:3], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_nop 0 @@ -2591,10 +2627,10 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_mov_b32 s3, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2673,7 +2709,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX12-LABEL: s_mul_u64_sext_with_vregs: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v2, v[2:3], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_nop 0 @@ -2783,9 +2819,9 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index 34a676bffcfe31..78add099fc61b9 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -74,9 +74,9 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX12-LABEL: s_add_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s2, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -170,9 +170,9 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX12-LABEL: s_add_v2i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s4, s6 ; GFX12-NEXT: s_add_co_i32 s3, s5, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -287,9 +287,9 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX12-LABEL: 
s_add_v4i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s3, s3, s7 ; GFX12-NEXT: s_add_co_i32 s2, s2, s6 ; GFX12-NEXT: s_add_co_i32 s0, s0, s4 @@ -454,7 +454,7 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s7, s15 ; GFX12-NEXT: s_add_co_i32 s3, s6, s14 ; GFX12-NEXT: s_add_co_i32 s6, s11, s19 @@ -728,7 +728,7 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 ; GFX12-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s7, s39 ; GFX12-NEXT: s_add_co_i32 s3, s6, s38 ; GFX12-NEXT: s_add_co_i32 s6, s11, s43 @@ -859,11 +859,11 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) -; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX12-NEXT: global_store_b32 v2, v0, 
s[0:1] ; GFX12-NEXT: s_nop 0 @@ -956,9 +956,9 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1051,7 +1051,7 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1157,9 +1157,9 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1308,7 +1308,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-LABEL: add64_in_branch: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX12-NEXT: ; %bb.1: ; %else @@ 
-1318,7 +1318,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: .LBB9_2: ; %if ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-NEXT: .LBB9_3: ; %endif -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index e3d2ecefbda30d..12434fa0375139 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -234,17 +234,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -265,17 +265,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX12W32-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -512,7 +512,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 @@ -520,9 +520,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -545,7 +545,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_mul_i32 s4, s2, 
s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 @@ -553,9 +553,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 @@ -858,17 +858,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -901,16 +901,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; 
GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1231,18 +1231,18 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_load_b32 s5, s[0:1], 0x44 ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, s5 ; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB3_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1277,17 +1277,17 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_load_b32 s8, s[0:1], 0x44 ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, s8 ; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen 
th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB3_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1369,10 +1369,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v1, 1 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1609,18 +1609,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: 
v_sub_nc_u32_e32 v0, s2, v0 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1641,18 +1641,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1893,7 +1893,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 @@ -1901,9 +1901,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; 
GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1926,7 +1926,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 @@ -1934,9 +1934,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -2241,17 +2241,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2284,17 +2284,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2376,10 +2376,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v1, 1 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index b4c8da44337ae5..01e615d64083a0 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -235,15 +235,15 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mul_i32 s6, s6, 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: v_mov_b32_e32 v1, s6 -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 ; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_waitcnt vmcnt(0) +; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB0_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -270,15 +270,15 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_mul_i32 s5, s5, 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: v_mov_b32_e32 v1, s5 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 ; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_waitcnt vmcnt(0) +; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB0_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -561,18 +561,18 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; 
GFX1264-NEXT: s_mul_i32 s2, s8, s2 ; GFX1264-NEXT: s_mov_b32 s14, -1 ; GFX1264-NEXT: v_mov_b32_e32 v1, s2 ; GFX1264-NEXT: s_mov_b32 s12, s6 ; GFX1264-NEXT: s_mov_b32 s13, s7 ; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_waitcnt vmcnt(0) +; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB1_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 @@ -599,18 +599,18 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_i32 s2, s0, s2 ; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: v_mov_b32_e32 v1, s2 ; GFX1232-NEXT: s_mov_b32 s8, s6 ; GFX1232-NEXT: s_mov_b32 s9, s7 ; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_waitcnt vmcnt(0) +; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB1_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 @@ -958,15 +958,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 ; GFX1264-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN -; 
GFX1264-NEXT: s_waitcnt vmcnt(0) +; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB2_4: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1006,15 +1006,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 ; GFX1232-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_waitcnt vmcnt(0) +; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB2_4: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1274,15 +1274,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: v_mov_b32_e32 v1, s7 -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 ; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_waitcnt vmcnt(0) +; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB3_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; 
GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1311,15 +1311,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 ; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_waitcnt vmcnt(0) +; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB3_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1655,7 +1655,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_cbranch_execz .LBB4_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: v_mov_b32_e32 v0, s8 @@ -1664,13 +1664,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 ; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_waitcnt vmcnt(0) +; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB4_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mul_lo_u32 v3, s1, v2 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 
; GFX1264-NEXT: s_mov_b32 s6, -1 @@ -1698,20 +1698,20 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] ; GFX1232-NEXT: s_mov_b32 s14, -1 ; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX1232-NEXT: s_mov_b32 s12, s6 ; GFX1232-NEXT: s_mov_b32 s13, s7 ; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_waitcnt vmcnt(0) +; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB4_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mul_lo_u32 v3, s1, v2 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s6, -1 @@ -1817,12 +1817,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s11, s7 ; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s8, s2 ; GFX12-NEXT: s_mov_b32 s9, s3 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_mov_b32 s5, s1 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null @@ -2098,15 +2098,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mul_i32 s6, s6, 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: v_mov_b32_e32 v1, s6 -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 
; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 ; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_waitcnt vmcnt(0) +; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB6_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 @@ -2134,15 +2134,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_mul_i32 s5, s5, 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: v_mov_b32_e32 v1, s5 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 ; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_waitcnt vmcnt(0) +; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB6_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 @@ -2430,18 +2430,18 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mul_i32 s2, s8, s2 ; GFX1264-NEXT: s_mov_b32 s14, -1 ; GFX1264-NEXT: v_mov_b32_e32 v1, s2 ; GFX1264-NEXT: s_mov_b32 s12, s6 ; GFX1264-NEXT: s_mov_b32 s13, s7 ; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_waitcnt vmcnt(0) +; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv 
scope:SCOPE_DEV ; GFX1264-NEXT: .LBB7_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 @@ -2468,18 +2468,18 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_i32 s2, s0, s2 ; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: v_mov_b32_e32 v1, s2 ; GFX1232-NEXT: s_mov_b32 s8, s6 ; GFX1232-NEXT: s_mov_b32 s9, s7 ; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_waitcnt vmcnt(0) +; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB7_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 @@ -2827,15 +2827,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 ; GFX1264-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_waitcnt vmcnt(0) +; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB8_4: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: 
s_delay_alu instid0(VALU_DEP_1) @@ -2875,15 +2875,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 ; GFX1232-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_waitcnt vmcnt(0) +; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB8_4: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3194,15 +3194,15 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: v_mov_b32_e32 v1, s7 -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 ; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_waitcnt vmcnt(0) +; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB9_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 @@ -3234,15 +3234,15 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 
; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 ; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_waitcnt vmcnt(0) +; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB9_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 @@ -3594,7 +3594,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_cbranch_execz .LBB10_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: v_mov_b32_e32 v0, s8 @@ -3603,11 +3603,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 ; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_waitcnt vmcnt(0) +; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB10_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mul_lo_u32 v5, s1, v2 ; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v0 @@ -3640,18 +3640,18 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] ; GFX1232-NEXT: s_mov_b32 s14, -1 ; GFX1232-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX1232-NEXT: s_mov_b32 s12, s6 ; GFX1232-NEXT: s_mov_b32 s13, s7 ; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_waitcnt vmcnt(0) +; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB10_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1232-NEXT: s_waitcnt lgkmcnt(0) +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mul_lo_u32 v5, s1, v2 ; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1232-NEXT: v_readfirstlane_b32 s0, v0 @@ -3762,12 +3762,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s11, s7 ; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s8, s2 ; GFX12-NEXT: s_mov_b32 s9, s3 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_mov_b32 s5, s1 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 79f8b3a1d5d84c..57a0b1b3d2b13e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -233,17 +233,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: 
.LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -264,17 +264,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -511,7 +511,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 @@ -519,9 +519,9 @@ 
define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -544,7 +544,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 @@ -552,9 +552,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 @@ -857,17 +857,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -900,16 +900,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -991,10 +991,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v1, 1 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 
@@ -1231,18 +1231,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB4_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1263,18 +1263,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB4_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, 
v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1515,7 +1515,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 @@ -1523,9 +1523,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1548,7 +1548,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 @@ -1556,9 +1556,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: 
s_delay_alu instid0(VALU_DEP_2) @@ -1863,17 +1863,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1906,17 +1906,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1998,10 +1998,10 @@ define amdgpu_kernel void 
@sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v1, 1 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index edf6fbadf1a60a..f0d2e36c487f1f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -241,17 +241,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -272,17 +272,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -527,7 +527,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 @@ -535,9 +535,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -560,7 +560,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: 
s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 @@ -568,9 +568,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 @@ -881,17 +881,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -925,16 +925,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; 
GFX12W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1016,10 +1016,10 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v1, 1 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1136,9 +1136,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 1 ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ 
-1150,9 +1150,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1397,18 +1397,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1429,18 +1429,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1689,7 +1689,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 @@ -1697,9 +1697,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1722,7 +1722,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: 
s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 @@ -1730,9 +1730,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -2045,17 +2045,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2089,17 +2089,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, v2, 
s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2181,10 +2181,10 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v1, 1 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2301,9 +2301,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 1 ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: s_waitcnt vmcnt(0) +; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2315,9 +2315,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 ; GFX12W32-NEXT: 
v_mov_b32_e32 v2, 1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: s_waitcnt vmcnt(0) +; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index e18bdc89e7d421..d0ba606052a604 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -89,18 +89,22 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; ; GFX1200-LABEL: syncscope_system: ; GFX1200: ; %bb.0: -; GFX1200-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: flat_load_b32 v3, v[0:1] ; GFX1200-NEXT: s_mov_b32 s0, 0 ; GFX1200-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1200-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: v_mov_b32_e32 v4, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX1200-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1200-NEXT: s_wait_storecnt 0x0 ; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN -; GFX1200-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SYS ; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX1200-NEXT: s_or_b32 s0, vcc_lo, s0 @@ -204,10 +208,14 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; ; GFX1200-LABEL: syncscope_workgroup_rtn: ; GFX1200: ; %bb.0: -; 
GFX1200-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1200-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_wait_storecnt 0x0 ; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX1200-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE ; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst @@ -334,11 +342,14 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; ; GFX1200-LABEL: syncscope_workgroup_nortn: ; GFX1200: ; %bb.0: -; GFX1200-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1200-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_wait_storecnt 0x0 ; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX1200-NEXT: s_waitcnt lgkmcnt(0) -; GFX1200-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1200-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE ; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst @@ -422,18 +433,22 @@ define float @no_unsafe(ptr %addr, float %val) { ; ; GFX1200-LABEL: no_unsafe: ; GFX1200: ; %bb.0: -; GFX1200-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: flat_load_b32 v3, v[0:1] ; GFX1200-NEXT: s_mov_b32 s0, 0 ; GFX1200-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1200-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: v_mov_b32_e32 v4, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX1200-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1200-NEXT: s_wait_storecnt 0x0 ; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN -; GFX1200-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE ; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX1200-NEXT: s_or_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll index f5846c3d6db737..860bacf19f39dd 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll @@ -34,7 +34,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3 ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: .LBB0_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x190 ; GFX12-NEXT: s_add_co_i32 s2, s2, -1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -100,7 +100,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: .LBB1_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s3, s[0:1], 0xfa0 ; GFX12-NEXT: s_add_co_i32 s2, s2, -1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -154,7 +154,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: .LBB2_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x3d0900 ; GFX12-NEXT: 
s_add_co_i32 s2, s2, -1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -209,7 +209,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x2625a00 ; GFX12-NEXT: .LBB3_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s2, -1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -255,7 +255,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: .LBB4_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s2, -1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -309,7 +309,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr, ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: .LBB5_1: ; %loop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190 ; GFX12-NEXT: s_add_co_i32 s2, s2, -1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index b95231fd8880f5..f2e28cc8e1bcd2 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -67,9 +67,9 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -148,9 +148,9 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -230,9 +230,9 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -322,9 +322,9 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v1, 0.5, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 @@ -416,9 +416,9 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 @@ -517,16 +517,16 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1 ; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1 ; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX12-NEXT: global_store_b32 v[0:1], v1, off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v1, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -605,9 +605,9 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -687,9 +687,9 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 clamp ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -770,9 +770,9 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| clamp ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -854,9 +854,9 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -935,9 +935,9 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1017,9 +1017,9 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; 
GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1104,9 +1104,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_med3_num_f32 v1, 0x80000000, 1.0, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1183,9 +1183,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1262,9 +1262,9 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1341,9 +1341,9 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1420,9 +1420,9 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1499,9 +1499,9 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1578,9 +1578,9 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1643,7 +1643,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0 -; 
GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1702,7 +1702,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1762,7 +1762,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1822,7 +1822,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1881,7 +1881,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1940,7 +1940,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; GFX12: ; %bb.0: ; 
GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2023,9 +2023,9 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2105,9 +2105,9 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2192,9 +2192,9 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2278,9 +2278,9 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2360,9 +2360,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2439,9 +2439,9 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2518,9 +2518,9 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2597,9 +2597,9 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2676,9 +2676,9 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2755,9 +2755,9 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2820,7 +2820,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2880,7 +2880,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 
:: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2962,9 +2962,9 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3062,9 +3062,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3161,9 +3161,9 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0 @@ -3262,9 +3262,9 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0 @@ -3355,9 +3355,9 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3449,9 +3449,9 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp @@ -3544,9 +3544,9 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3636,9 +3636,9 @@ define 
amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_hi:[1,1] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3728,9 +3728,9 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3829,9 +3829,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3929,9 +3929,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp ; GFX12-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -4023,9 +4023,9 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_f32 s2, s4, s5 ; GFX12-NEXT: s_add_f32 s3, s4, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index a8dcb74c7d5f92..4bfaa6e90bdfee 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -51,9 +51,13 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 { ; ; GFX12-LABEL: global_load_2xi16_align2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1 %p.0 = load i16, ptr addrspace(1) %p, align 2 @@ -125,7 +129,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -193,9 +197,13 @@ define i32 
@global_load_2xi16_align1(ptr addrspace(1) %p) #0 { ; ; GFX12-LABEL: global_load_2xi16_align1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1 %p.0 = load i16, ptr addrspace(1) %p, align 1 @@ -278,7 +286,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -328,9 +336,13 @@ define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 { ; ; GFX12-LABEL: global_load_2xi16_align4: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1 %p.0 = load i16, ptr addrspace(1) %p, align 4 @@ -396,7 +408,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index 61f0e5e2a88928..f9694dcd89abfb 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -75,16 +75,24 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 { ; ; GFX12-LABEL: private_load_2xi16_align2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v0, off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FLASTSCR-LABEL: private_load_2xi16_align2: ; GFX12-FLASTSCR: ; %bb.0: -; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_expcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_samplecnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0 ; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off -; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX12-FLASTSCR-NEXT: s_wait_loadcnt 0x0 ; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1 %p.0 = load i16, ptr addrspace(5) %p, align 2 @@ -163,14 +171,22 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r ; ; GFX12-LABEL: private_store_2xi16_align2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX12-NEXT: 
scratch_store_b32 v1, v0, off ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FLASTSCR-LABEL: private_store_2xi16_align2: ; GFX12-FLASTSCR: ; %bb.0: -; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_expcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_samplecnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0 ; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off ; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] @@ -255,16 +271,24 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 { ; ; GFX12-LABEL: private_load_2xi16_align1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v0, off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FLASTSCR-LABEL: private_load_2xi16_align1: ; GFX12-FLASTSCR: ; %bb.0: -; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_expcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_samplecnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0 ; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off -; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX12-FLASTSCR-NEXT: s_wait_loadcnt 0x0 ; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1 %p.0 = load i16, ptr addrspace(5) %p, align 1 @@ -348,14 +372,22 @@ define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r ; ; GFX12-LABEL: private_store_2xi16_align1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX12-NEXT: scratch_store_b32 v1, v0, off ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FLASTSCR-LABEL: private_store_2xi16_align1: ; GFX12-FLASTSCR: ; %bb.0: -; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_expcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_samplecnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0 ; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off ; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] @@ -425,16 +457,24 @@ define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 { ; ; GFX12-LABEL: private_load_2xi16_align4: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v0, off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FLASTSCR-LABEL: private_load_2xi16_align4: ; GFX12-FLASTSCR: ; %bb.0: -; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_expcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_samplecnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0 ; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off -; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX12-FLASTSCR-NEXT: s_wait_loadcnt 0x0 ; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1 %p.0 = load i16, ptr addrspace(5) %p, align 4 @@ -510,14 +550,22 @@ define void 
@private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r ; ; GFX12-LABEL: private_store_2xi16_align4: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX12-NEXT: scratch_store_b32 v1, v0, off ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FLASTSCR-LABEL: private_store_2xi16_align4: ; GFX12-FLASTSCR: ; %bb.0: -; GFX12-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-FLASTSCR-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_expcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_samplecnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0 ; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off ; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll index 93e8630dc7f560..4d432fbcfef973 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll @@ -27,7 +27,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %ou ; GFX12-LABEL: test_scratch_load_i8_zext_v: ; GFX12: ; %bb.0: ; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1 @@ -59,7 +59,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %ou ; GFX12-LABEL: test_scratch_load_i8_sext_v: ; GFX12: ; %bb.0: ; GFX12-NEXT: scratch_load_i8 v0, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: 
s_endpgm %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1 @@ -91,7 +91,7 @@ define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %o ; GFX12-LABEL: test_scratch_load_i16_zext_v: ; GFX12: ; %bb.0: ; GFX12-NEXT: scratch_load_u16 v0, v0, off offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1 @@ -123,7 +123,7 @@ define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %o ; GFX12-LABEL: test_scratch_load_i16_sext_v: ; GFX12: ; %bb.0: ; GFX12-NEXT: scratch_load_i16 v0, v0, off offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1 @@ -159,7 +159,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_v(ptr addrspace(5) %i ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX12-NEXT: scratch_load_d16_u8 v3, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -197,7 +197,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_v(ptr addrspace(5) %i ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX12-NEXT: scratch_load_d16_i8 v3, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -235,7 +235,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_v(ptr addrspace(5) %in, p ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX12-NEXT: scratch_load_d16_b16 v3, v0, off offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -273,7 +273,7 @@ define 
amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %i ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v3, -1 ; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -311,7 +311,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %i ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v3, -1 ; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -349,7 +349,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, p ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v3, -1 ; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, off offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -384,7 +384,7 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_v(ptr %in, ptr addrspac ; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_v: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b32 v0, v[0:1] -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: scratch_store_d16_hi_b8 v2, v0, off offset:4 ; GFX12-NEXT: s_endpgm bb: @@ -419,7 +419,7 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_v(ptr %in, ptr addrspa ; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_v: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b32 v0, v[0:1] -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: scratch_store_d16_hi_b16 v2, v0, off offset:2 ; GFX12-NEXT: s_endpgm bb: @@ -457,7 +457,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, p ; GFX12-LABEL: test_scratch_load_i8_zext_s: ; GFX12: ; %bb.0: ; GFX12-NEXT: 
scratch_load_u8 v2, off, s0 offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1 @@ -489,7 +489,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, p ; GFX12-LABEL: test_scratch_load_i8_sext_s: ; GFX12: ; %bb.0: ; GFX12-NEXT: scratch_load_i8 v2, off, s0 offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1 @@ -521,7 +521,7 @@ define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in, ; GFX12-LABEL: test_scratch_load_i16_zext_s: ; GFX12: ; %bb.0: ; GFX12-NEXT: scratch_load_u16 v2, off, s0 offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1 @@ -553,7 +553,7 @@ define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in, ; GFX12-LABEL: test_scratch_load_i16_sext_s: ; GFX12: ; %bb.0: ; GFX12-NEXT: scratch_load_i16 v2, off, s0 offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1 @@ -590,7 +590,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) in ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000 ; GFX12-NEXT: scratch_load_d16_u8 v2, off, s0 offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm bb: @@ -629,7 +629,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) in ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000 ; GFX12-NEXT: 
scratch_load_d16_i8 v2, off, s0 offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm bb: @@ -668,7 +668,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000 ; GFX12-NEXT: scratch_load_d16_b16 v2, off, s0 offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm bb: @@ -707,7 +707,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) in ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v2, -1 ; GFX12-NEXT: scratch_load_d16_hi_u8 v2, off, s0 offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm bb: @@ -746,7 +746,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) in ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v2, -1 ; GFX12-NEXT: scratch_load_d16_hi_i8 v2, off, s0 offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm bb: @@ -785,7 +785,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v2, -1 ; GFX12-NEXT: scratch_load_d16_hi_b16 v2, off, s0 offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm bb: @@ -820,7 +820,7 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspac ; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_s: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b32 v0, v[0:1] -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: scratch_store_d16_hi_b8 off, v0, s0 offset:4 ; GFX12-NEXT: s_endpgm bb: @@ -855,7 +855,7 @@ 
define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspa ; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_s: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b32 v0, v[0:1] -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: scratch_store_d16_hi_b16 off, v0, s0 offset:2 ; GFX12-NEXT: s_endpgm bb: @@ -896,7 +896,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in, ; GFX12: ; %bb.0: ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm %voffset4 = mul i32 %voffset, 4 @@ -933,7 +933,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in, ; GFX12: ; %bb.0: ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: scratch_load_i8 v0, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm %voffset4 = mul i32 %voffset, 4 @@ -970,7 +970,7 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in ; GFX12: ; %bb.0: ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: scratch_load_u16 v0, v0, off offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm %voffset4 = mul i32 %voffset, 4 @@ -1007,7 +1007,7 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in ; GFX12: ; %bb.0: ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: scratch_load_i16 v0, v0, off offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v0 ; GFX12-NEXT: s_endpgm %voffset4 = mul i32 %voffset, 4 @@ -1049,7 +1049,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5) ; GFX12-NEXT: 
v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX12-NEXT: scratch_load_d16_u8 v3, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -1093,7 +1093,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5) ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX12-NEXT: scratch_load_d16_i8 v3, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -1137,7 +1137,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inre ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX12-NEXT: scratch_load_d16_b16 v3, v0, off offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -1181,7 +1181,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5) ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: v_mov_b32_e32 v3, -1 ; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -1225,7 +1225,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5) ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: v_mov_b32_e32 v3, -1 ; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -1269,7 +1269,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inre ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: v_mov_b32_e32 v3, -1 ; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, off 
offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b32 v[1:2], v3 ; GFX12-NEXT: s_endpgm bb: @@ -1310,7 +1310,7 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrsp ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b32 v0, v[0:1] ; GFX12-NEXT: v_lshl_add_u32 v1, v2, 2, s0 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: scratch_store_d16_hi_b8 v1, v0, off offset:4 ; GFX12-NEXT: s_endpgm bb: @@ -1351,7 +1351,7 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrs ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b32 v0, v[0:1] ; GFX12-NEXT: v_lshl_add_u32 v1, v2, 2, s0 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: scratch_store_d16_hi_b16 v1, v0, off offset:2 ; GFX12-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 04eb6dcff4632b..2fd0367f599068 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -94,14 +94,14 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: 
scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff1_voff1: @@ -109,14 +109,14 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: %soff1 = mul i32 %soff, 1 @@ -220,15 +220,15 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, 
off offset:2 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff1_voff2: @@ -236,15 +236,15 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: %soff1 = mul i32 %soff, 1 @@ -348,15 +348,15 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: 
v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff1_voff4: @@ -364,15 +364,15 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: 
scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: %soff1 = mul i32 %soff, 1 @@ -477,16 +477,16 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff2_voff1: @@ -494,16 +494,16 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 
th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: %soff2 = mul i32 %soff, 2 @@ -611,16 +611,16 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff2_voff2: @@ -628,16 
+628,16 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: %soff2 = mul i32 %soff, 2 @@ -745,16 +745,16 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff2_voff4: @@ -762,16 +762,16 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: %soff2 = mul i32 %soff, 2 @@ -876,16 +876,16 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 
v2, 2 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff4_voff1: @@ -893,16 +893,16 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: 
scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: %soff4 = mul i32 %soff, 4 @@ -1010,16 +1010,16 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff4_voff2: @@ -1027,16 +1027,16 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; 
GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: %soff4 = mul i32 %soff, 4 @@ -1142,16 +1142,16 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; 
GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff4_voff4: @@ -1159,16 +1159,16 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT -; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: %soff4 = mul i32 %soff, 4 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 98379f5e3c68b4..1b9f2a688a1901 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -259,7 +259,11 @@ define void @zero_init_foo() { ; ; GFX12-LABEL: zero_init_foo: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 @@ -344,7 +348,11 @@ define void @zero_init_foo() { ; ; GFX12-PAL-LABEL: zero_init_foo: ; GFX12-PAL: ; %bb.0: -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 @@ -422,16 +430,16 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_add_co_i32 s0, s0, 4 ; GFX12-NEXT: s_add_co_i32 s1, s1, 4 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_kernel: @@ -517,16 +525,16 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX12-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 4 ; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 4 -; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -595,10 +603,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_add_co_i32 s0, s0, 4 ; GFX12-NEXT: s_add_co_i32 s1, s1, 4 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_foo: @@ -681,10 +689,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 4 ; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 4 -; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -743,10 +751,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_nc_u32_e32 v2, 4, 
v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_kernel: @@ -816,10 +824,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -876,15 +884,19 @@ define void @store_load_vindex_foo(i32 %idx) { ; ; GFX12-LABEL: store_load_vindex_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s32 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, s32 
th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_foo: @@ -943,15 +955,19 @@ define void @store_load_vindex_foo(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_vindex_foo: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 x float], align 4, addrspace(5) @@ -988,7 +1004,11 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { ; ; GFX12-LABEL: private_ptr_foo: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1025,7 +1045,11 @@ define void @private_ptr_foo(ptr 
addrspace(5) nocapture %arg) { ; ; GFX12-PAL-LABEL: private_ptr_foo: ; GFX12-PAL: ; %bb.0: -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] @@ -1097,8 +1121,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX12-LABEL: zero_init_small_offset_kernel: ; GFX12: ; %bb.0: -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 @@ -1228,8 +1252,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX12-PAL-LABEL: zero_init_small_offset_kernel: ; GFX12-PAL: ; %bb.0: -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 @@ -1312,9 +1336,13 @@ define void @zero_init_small_offset_foo() { ; ; GFX12-LABEL: zero_init_small_offset_foo: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS +; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 @@ -1407,9 +1435,13 @@ define void @zero_init_small_offset_foo() { ; ; GFX12-PAL-LABEL: zero_init_small_offset_foo: ; GFX12-PAL: ; %bb.0: -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 @@ -1495,19 +1527,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_addk_co_i32 s0, 0x104 ; GFX12-NEXT: s_addk_co_i32 s1, 0x104 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: 
store_load_sindex_small_offset_kernel: @@ -1629,19 +1661,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX12-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x104 ; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x104 -; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1714,18 +1746,18 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; ; GFX12-LABEL: store_load_sindex_small_offset_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_addk_co_i32 s0, 0x104 ; GFX12-NEXT: s_addk_co_i32 s1, 0x104 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -1837,18 +1869,18 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_small_offset_foo: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x104 ; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x104 -; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1914,13 +1946,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -2020,13 +2052,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -2095,17 +2127,21 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; ; GFX12-LABEL: store_load_vindex_small_offset_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x100 -; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: @@ -2176,17 +2212,21 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_vindex_small_offset_foo: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX12-PAL-NEXT: s_add_co_i32 s0, s32, 0x100 -; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: 
scratch_store_b32 v0, v2, off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -2267,8 +2307,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX12-LABEL: zero_init_large_offset_kernel: ; GFX12: ; %bb.0: -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 @@ -2403,8 +2443,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX12-PAL-LABEL: zero_init_large_offset_kernel: ; GFX12-PAL: ; %bb.0: -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 @@ -2498,9 +2538,13 @@ define void @zero_init_large_offset_foo() { ; ; GFX12-LABEL: zero_init_large_offset_foo: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 @@ -2634,9 +2678,13 @@ define void @zero_init_large_offset_foo() { ; ; 
GFX12-PAL-LABEL: zero_init_large_offset_foo: ; GFX12-PAL: ; %bb.0: -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 ; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 @@ -2722,19 +2770,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_addk_co_i32 s0, 0x4004 ; GFX12-NEXT: s_addk_co_i32 s1, 0x4004 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -2856,19 +2904,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel: ; 
GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX12-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4004 ; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4004 -; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2941,18 +2989,18 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; ; GFX12-LABEL: store_load_sindex_large_offset_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-NEXT: s_addk_co_i32 s0, 0x4004 ; GFX12-NEXT: s_addk_co_i32 s1, 0x4004 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-NEXT: 
s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -3064,18 +3112,18 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_large_offset_foo: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4004 ; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4004 -; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -3142,13 +3190,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12-LABEL: store_load_vindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, 
v2, off offset:124 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -3250,13 +3298,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -3326,17 +3374,21 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; ; GFX12-LABEL: store_load_vindex_large_offset_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 -; GFX12-NEXT: scratch_load_b32 v3, off, s32 
th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: @@ -3409,17 +3461,21 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_vindex_large_offset_foo: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX12-PAL-NEXT: s_add_co_i32 s0, s32, 0x4000 -; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS +; 
GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -3485,12 +3541,12 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX12-LABEL: store_load_large_imm_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -3590,12 +3646,12 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX12-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX12-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:16004 
scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -3656,14 +3712,18 @@ define void @store_load_large_imm_offset_foo() { ; ; GFX12-LABEL: store_load_large_imm_offset_foo: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX12-NEXT: scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: @@ -3727,14 +3787,18 @@ define void @store_load_large_imm_offset_foo() { ; ; GFX12-PAL-LABEL: store_load_large_imm_offset_foo: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -3797,14 +3861,14 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX12-LABEL: store_load_vidx_sidx_offset: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, 4 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: @@ -3879,14 +3943,14 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX12-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 -; GFX12-PAL-NEXT: 
scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) @@ -3934,12 +3998,16 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-LABEL: store_load_i64_aligned: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 -; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_i64_aligned: @@ -3987,12 +4055,16 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-PAL-LABEL: store_load_i64_aligned: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 -; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], 
off th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, ptr addrspace(5) %arg, align 8 @@ -4035,12 +4107,16 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-LABEL: store_load_i64_unaligned: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 -; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_i64_unaligned: @@ -4088,12 +4164,16 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-PAL-LABEL: store_load_i64_unaligned: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 -; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT -; 
GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, ptr addrspace(5) %arg, align 1 @@ -4139,13 +4219,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-LABEL: store_load_v3i32_unaligned: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-NEXT: v_mov_b32_e32 v3, 3 -; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_v3i32_unaligned: @@ -4197,13 +4281,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-PAL-LABEL: store_load_v3i32_unaligned: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-PAL-NEXT: v_mov_b32_e32 v3, 3 -; GFX12-PAL-NEXT: 
scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> , ptr addrspace(5) %arg, align 1 @@ -4251,13 +4339,17 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-LABEL: store_load_v4i32_unaligned: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 -; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_v4i32_unaligned: @@ -4312,13 +4404,17 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; ; GFX12-PAL-LABEL: store_load_v4i32_unaligned: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: 
v_dual_mov_b32 v2, 2 ; GFX12-PAL-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 -; GFX12-PAL-NEXT: scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> , ptr addrspace(5) %arg, align 1 @@ -4360,12 +4456,16 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; ; GFX12-LABEL: store_load_i32_negative_unaligned: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-1 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_i32_negative_unaligned: @@ -4423,12 +4523,16 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; ; GFX12-PAL-LABEL: store_load_i32_negative_unaligned: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1 @@ -4472,12 +4576,16 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; ; GFX12-LABEL: store_load_i32_large_negative_unaligned: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-4225 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned: @@ -4536,12 +4644,16 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; ; GFX12-PAL-LABEL: store_load_i32_large_negative_unaligned: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-4225 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] bb: %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225 @@ -4626,10 +4738,10 @@ define amdgpu_ps void @large_offset() { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 th:TH_STORE_NT_RT -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use v0 @@ -4744,10 +4856,10 @@ define amdgpu_ps void @large_offset() { ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 th:TH_STORE_NT_RT -; GFX12-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 th:TH_LOAD_RT_NT -; GFX12-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off 
offset:3024 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810 ; GFX12-PAL-NEXT: ;;#ASMSTART ; GFX12-PAL-NEXT: ; use v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index d7f780e414caed..524632cb2db336 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -37,12 +37,11 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -95,11 +94,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -155,15 +154,14 @@ define amdgpu_kernel void 
@atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -219,14 +217,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -269,12 +267,11 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -322,11 +319,11 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -377,15 +374,14 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -436,14 +432,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; 
GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -489,12 +485,11 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -547,11 +542,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 
v[2:3], v[0:1] @@ -607,15 +602,14 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -671,14 +665,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -721,12 +715,11 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -774,11 +767,11 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -829,15 +822,14 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -888,14 +880,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; 
%entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -941,12 +933,11 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -999,11 +990,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: 
v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -1059,15 +1050,14 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -1123,14 +1113,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -1173,12 +1163,11 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -1226,11 +1215,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -1281,15 +1270,14 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -1340,14 +1328,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; 
GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -1391,12 +1379,11 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -1449,11 +1436,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: 
global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -1507,15 +1494,14 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -1571,14 +1557,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -1619,12 +1605,11 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -1672,11 +1657,11 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -1725,15 +1710,14 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -1784,14 +1768,14 @@ define amdgpu_kernel void 
@atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -1835,12 +1819,11 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -1893,11 +1876,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -1951,15 +1934,14 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2015,14 +1997,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -2063,12 +2045,11 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umax_i64: ; 
GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2116,11 +2097,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -2169,15 +2150,14 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: 
s_endpgm entry: @@ -2228,14 +2208,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -2279,12 +2259,11 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2337,11 +2316,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] 
offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -2395,15 +2374,14 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2459,14 +2437,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -2507,12 +2485,11 @@ define amdgpu_kernel void 
@atomic_min_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2560,11 +2537,11 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -2613,15 +2590,14 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: 
s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2672,14 +2648,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -2723,12 +2699,11 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2781,11 +2756,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: 
v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -2839,15 +2814,14 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2903,14 +2877,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: 
flat_store_b64 v[2:3], v[0:1] @@ -2951,12 +2925,11 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -3004,11 +2977,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -3057,15 +3030,14 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] -; GFX12-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -3116,14 +3088,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -3169,12 +3141,11 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -3227,11 +3198,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 
s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -3287,15 +3258,14 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -3351,14 +3321,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: 
v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -3401,12 +3371,11 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -3454,11 +3423,11 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -3509,15 +3478,14 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: 
flat_atomic_or_b64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -3568,14 +3536,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -3621,12 +3589,11 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -3669,12 +3636,11 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: @@ -3717,12 +3683,11 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: @@ -3775,11 +3740,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -3835,15 +3800,14 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -3899,14 +3863,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -3949,12 +3913,11 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: 
s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4002,11 +3965,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4057,15 +4020,14 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4116,14 +4078,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: 
v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4169,12 +4131,11 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4227,11 +4188,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4287,15 +4248,14 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_clause 0x1 
; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4351,14 +4311,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4401,12 +4361,11 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] -; GFX12-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4454,11 +4413,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4509,15 +4468,14 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4568,14 +4526,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4623,11 +4581,11 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -4670,11 +4628,11 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -4730,14 +4688,14 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: 
s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -4791,14 +4749,14 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -4839,7 +4797,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 @@ -4876,7 +4834,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 
%in, ptr %out) { ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4926,7 +4884,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] @@ -4977,7 +4935,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] @@ -5033,13 +4991,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5090,13 +5047,12 @@ 
define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5149,12 +5105,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5211,15 +5167,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: 
v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5283,14 +5238,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5342,13 +5297,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5396,12 +5350,12 @@ define amdgpu_kernel void 
@atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5453,15 +5407,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5520,14 +5473,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] ; 
GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5576,11 +5529,11 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -5623,11 +5576,11 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_f64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -5683,14 +5636,14 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -5744,14 +5697,14 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -5792,7 +5745,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12-LABEL: atomic_store_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 @@ -5829,7 +5782,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GFX12-LABEL: atomic_store_f64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 
-; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5879,7 +5832,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] @@ -5930,7 +5883,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] @@ -5978,12 +5931,11 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6036,11 +5988,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; 
GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -6096,15 +6048,14 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6160,14 +6111,14 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -6210,12 +6161,11 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_inc_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6263,11 +6213,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -6318,15 +6268,14 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6377,14 +6326,14 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_inc_i64_ret_incr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -6430,12 +6379,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6488,11 +6436,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 
0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -6548,15 +6496,14 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6612,14 +6559,14 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 
th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -6662,12 +6609,11 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_dec_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6715,11 +6661,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -6770,15 +6716,14 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; 
GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6829,14 +6774,14 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_dec_i64_ret_decr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index 4fe2a4ad2a2fba..4c716444604618 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -253,11 +253,11 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_b32 v1, v0, s[6:7] th:TH_LOAD_RT_NT -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_b32 v2, v0, s[0:1] th:TH_LOAD_RT_NT -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_load_b32 v1, v0, s[6:7] 
scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_maximum_f32 v1, v1, v2 ; GCN-NEXT: global_store_b32 v0, v1, s[4:5] ; GCN-NEXT: s_nop 0 @@ -277,11 +277,11 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_u16 v1, v0, s[6:7] th:TH_LOAD_RT_NT -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_u16 v2, v0, s[0:1] th:TH_LOAD_RT_NT -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_maximum_f16 v1, v1, v2 ; GCN-NEXT: global_store_b16 v0, v1, s[4:5] ; GCN-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index b63a4fa40b5913..5d1b8b7ad0512f 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -253,11 +253,11 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_b32 v1, v0, s[6:7] th:TH_LOAD_RT_NT -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_b32 v2, v0, s[0:1] th:TH_LOAD_RT_NT -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_minimum_f32 v1, v1, v2 ; GCN-NEXT: global_store_b32 v0, v1, s[4:5] ; GCN-NEXT: s_nop 0 @@ -277,11 +277,11 @@ 
define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_u16 v1, v0, s[6:7] th:TH_LOAD_RT_NT -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_u16 v2, v0, s[0:1] th:TH_LOAD_RT_NT -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_minimum_f16 v1, v1, v2 ; GCN-NEXT: global_store_b16 v0, v1, s[4:5] ; GCN-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll index 5c3e83c4b9cfe1..48e8d87129c590 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll @@ -27,9 +27,9 @@ define amdgpu_cs float @flat_atomic_fmin_num_f32_rtn(ptr %ptr, float %data, ptr ; GFX12-LABEL: flat_atomic_fmin_num_f32_rtn: ; GFX12: ; %bb.0: ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[3:4], v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %ret = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data) store float %ret, ptr %out @@ -40,9 +40,9 @@ define amdgpu_cs float @flat_atomic_fmax_num_f32_rtn(ptr %ptr, float %data, ptr ; GFX12-LABEL: flat_atomic_fmax_num_f32_rtn: ; GFX12: ; %bb.0: ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[3:4], v0 -; GFX12-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %ret = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data) store float %ret, ptr %out diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll index cf069ee92639f8..69c2c60e4ae2f8 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll @@ -31,7 +31,7 @@ define amdgpu_cs void @global_atomic_fmax_num_f32_rtn(ptr addrspace(1) %ptr, flo ; GFX12-LABEL: global_atomic_fmax_num_f32_rtn: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[3:4], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -45,7 +45,7 @@ define amdgpu_cs void @global_atomic_fmin_num_f32_rtn(ptr addrspace(1) %ptr, flo ; GFX12-LABEL: global_atomic_fmin_num_f32_rtn: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[3:4], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index b2b3f3e1bfbd96..2cbc346836212e 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -31,7 +31,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0(ptr addrspace(1) inreg %sb ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = load i8, ptr addrspace(1) %sbase %zext = zext i8 %load to 
i32 @@ -66,7 +66,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4095(ptr addrspace(1) inreg ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095 %load = load i8, ptr addrspace(1) %gep0 @@ -95,7 +95,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4096(ptr addrspace(1) inreg ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4096 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4096 %load = load i8, ptr addrspace(1) %gep0 @@ -124,7 +124,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4097(ptr addrspace(1) inreg ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4097 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4097 %load = load i8, ptr addrspace(1) %gep0 @@ -161,7 +161,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inr ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096 %load = load i8, ptr addrspace(1) %gep0 @@ -202,7 +202,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inr ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4097 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097 %load = load i8, ptr addrspace(1) %gep0 @@ -243,7 +243,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inr ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4098 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098 %load = load i8, ptr addrspace(1) %gep0 @@ -279,7 +279,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2048(ptr addrspace(1) inreg ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2048 %load = load i8, ptr addrspace(1) %gep0 @@ -315,7 +315,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2049(ptr addrspace(1) inreg ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2049 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2049 %load = load i8, ptr addrspace(1) %gep0 @@ -351,7 +351,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2050(ptr addrspace(1) inreg ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2050 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2050 %load = load i8, ptr addrspace(1) %gep0 @@ -380,7 +380,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inr ; GFX12: ; 
%bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048 %load = load i8, ptr addrspace(1) %gep0 @@ -417,7 +417,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inr ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049 %load = load i8, ptr addrspace(1) %gep0 @@ -454,7 +454,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inr ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2050 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050 %load = load i8, ptr addrspace(1) %gep0 @@ -489,7 +489,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x7FFFFF(ptr addrspace(1) in ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8388607 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 8388607 %load = load i8, ptr addrspace(1) %gep0 @@ -529,7 +529,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFF(ptr addrspace(1) in ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8388608 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr 
addrspace(1) %sbase, i64 -8388608 %load = load i8, ptr addrspace(1) %gep0 @@ -564,7 +564,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1) ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0xff800000 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8388607 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967295 %load = load i8, ptr addrspace(1) %gep0 @@ -606,7 +606,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000000: @@ -616,7 +616,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967296 %load = load i8, ptr addrspace(1) %gep0 @@ -658,7 +658,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:1 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000001: @@ -668,7 +668,7 @@ define amdgpu_ps float 
@global_load_saddr_i8_offset_0x100000001(ptr addrspace(1) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967297 %load = load i8, ptr addrspace(1) %gep0 @@ -710,7 +710,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000FFF(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000FFF: @@ -720,7 +720,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000FFF(ptr addrspace(1) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971391 %load = load i8, ptr addrspace(1) %gep0 @@ -762,7 +762,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100001000(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100001000: @@ -772,7 +772,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100001000(ptr addrspace(1) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971392 %load = load i8, ptr addrspace(1) %gep0 @@ -815,7 +815,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace( ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: @@ -825,7 +825,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace( ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967295 %load = load i8, ptr addrspace(1) %gep0 @@ -867,7 +867,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000000: @@ -877,7 +877,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; 
GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967296 %load = load i8, ptr addrspace(1) %gep0 @@ -919,7 +919,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000001(ptr addrspace ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-1 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000001: @@ -929,7 +929,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000001(ptr addrspace ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967297 %load = load i8, ptr addrspace(1) %gep0 @@ -959,7 +959,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -996,7 +996,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(ptr addrspace ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = 
zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1045,7 +1045,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(ptr addrspace ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4096 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1083,7 +1083,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(ptr addrsp ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1132,7 +1132,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(ptr addrsp ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4097 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1160,7 +1160,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(ptr addrspace ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1198,7 +1198,7 @@ 
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(ptr addrspace ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1226,7 +1226,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1264,7 +1264,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(ptr addrsp ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1313,7 +1313,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF(ptr addrs ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8388607 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 8388607 @@ -1361,7 +1361,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF(ptr addrs ; 
GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8388608 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -8388608 @@ -1399,7 +1399,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(ptr ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095 @@ -1427,7 +1427,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 @@ -1456,7 +1456,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 @@ -1485,7 +1485,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_ ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 -; 
GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 @@ -1515,7 +1515,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_ ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 @@ -1574,22 +1574,22 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: ds_load_b64 v[1:2], v1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v2 ; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_uniform_ptr_in_vgprs: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: ds_load_b64 v[1:2], v1 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v1, v0 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 @@ -1640,22 +1640,22 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo ; GFX12-SDAG: ; %bb.0: ; 
GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: ds_load_b64 v[1:2], v1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v2 ; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] offset:42 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: ds_load_b64 v[1:2], v1 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v1, v0 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:42 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 @@ -1687,7 +1687,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(ptr addrspace(1 ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1717,7 +1717,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr a ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-24 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1748,7 +1748,7 @@ define 
amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 @@ -1780,7 +1780,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_ ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 @@ -1824,7 +1824,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, v0, s2 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32: @@ -1836,7 +1836,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v0, v2 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset @@ -1879,7 +1879,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, v0, s2 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX12-SDAG-NEXT: global_load_u8 v0, 
v[0:1], off offset:4095 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: @@ -1891,7 +1891,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v0, v2 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:4095 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset @@ -1951,14 +1951,14 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1 ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, s2, v0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_f32_natural_addressing: @@ -1967,14 +1967,14 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v2, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc ; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %voffset = load i32, ptr addrspace(1) %voffset.ptr %zext.offset = zext i32 %voffset to i64 @@ -2004,9 +2004,9 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(ptr a ; GFX12-LABEL: global_load_saddr_f32_natural_addressing_immoffset: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %voffset = load i32, ptr addrspace(1) %voffset.ptr %zext.offset = zext i32 %voffset to i64 @@ -2039,10 +2039,10 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(ptr addrspace(1) i ; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{} %zext.offset = zext i32 %voffset to i64 @@ -2074,10 +2074,10 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(ptr add ; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:400 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{} %zext.offset = zext i32 %voffset to i64 @@ -2132,14 +2132,14 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, s2, v0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: @@ -2148,14 +2148,14 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v2, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc ; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range 
!1, !noundef !{} %zext.offset = zext i32 %voffset to i64 @@ -2184,7 +2184,7 @@ define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32 ; GFX12-LABEL: global_load_saddr_i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2209,7 +2209,7 @@ define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %s ; GFX12-LABEL: global_load_saddr_i16_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2235,7 +2235,7 @@ define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32 ; GFX12-LABEL: global_load_saddr_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2259,7 +2259,7 @@ define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %s ; GFX12-LABEL: global_load_saddr_f16_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2284,7 +2284,7 @@ define amdgpu_ps float @global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32 ; GFX12-LABEL: global_load_saddr_i32: ; GFX12: ; %bb.0: ; 
GFX12-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2309,7 +2309,7 @@ define amdgpu_ps float @global_load_saddr_i32_immneg128(ptr addrspace(1) inreg % ; GFX12-LABEL: global_load_saddr_i32_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2335,7 +2335,7 @@ define amdgpu_ps float @global_load_saddr_f32(ptr addrspace(1) inreg %sbase, i32 ; GFX12-LABEL: global_load_saddr_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2359,7 +2359,7 @@ define amdgpu_ps float @global_load_saddr_f32_immneg128(ptr addrspace(1) inreg % ; GFX12-LABEL: global_load_saddr_f32_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2384,7 +2384,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2i16(ptr addrspace(1) inreg %sba ; GFX12-LABEL: global_load_saddr_v2i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr 
inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2409,7 +2409,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v2i16_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2435,7 +2435,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2f16(ptr addrspace(1) inreg %sba ; GFX12-LABEL: global_load_saddr_v2f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2459,7 +2459,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v2f16_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2484,7 +2484,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_p3(ptr addrspace(1) inreg %sbase, ; GFX12-LABEL: global_load_saddr_p3: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2510,7 +2510,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(ptr addrspace(1) inr ; GFX12-LABEL: global_load_saddr_p3_immneg128: ; GFX12: ; 
%bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2537,7 +2537,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_f64(ptr addrspace(1) inreg %sbas ; GFX12-LABEL: global_load_saddr_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2562,7 +2562,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(ptr addrspace(1) i ; GFX12-LABEL: global_load_saddr_f64_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2588,7 +2588,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_i64(ptr addrspace(1) inreg %sbas ; GFX12-LABEL: global_load_saddr_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2613,7 +2613,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(ptr addrspace(1) i ; GFX12-LABEL: global_load_saddr_i64_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 
%voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2639,7 +2639,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2f32(ptr addrspace(1) inreg %sb ; GFX12-LABEL: global_load_saddr_v2f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2663,7 +2663,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v2f32_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2688,7 +2688,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2i32(ptr addrspace(1) inreg %sb ; GFX12-LABEL: global_load_saddr_v2i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2713,7 +2713,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v2i32_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2739,7 +2739,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4i16(ptr addrspace(1) inreg %sb 
; GFX12-LABEL: global_load_saddr_v4i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2764,7 +2764,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v4i16_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2790,7 +2790,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4f16(ptr addrspace(1) inreg %sb ; GFX12-LABEL: global_load_saddr_v4f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2815,7 +2815,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v4f16_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2841,7 +2841,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_p1(ptr addrspace(1) inreg %sbase ; GFX12-LABEL: global_load_saddr_p1: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return 
to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2867,7 +2867,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(ptr addrspace(1) in ; GFX12-LABEL: global_load_saddr_p1_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2894,7 +2894,7 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3f32(ptr addrspace(1) inreg %sb ; GFX12-LABEL: global_load_saddr_v3f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2918,7 +2918,7 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v3f32_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2943,7 +2943,7 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3i32(ptr addrspace(1) inreg %sb ; GFX12-LABEL: global_load_saddr_v3i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2968,7 +2968,7 @@ define amdgpu_ps <3 x float> 
@global_load_saddr_v3i32_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v3i32_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2994,7 +2994,7 @@ define amdgpu_ps <6 x half> @global_load_saddr_v6f16(ptr addrspace(1) inreg %sba ; GFX12-LABEL: global_load_saddr_v6f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3018,7 +3018,7 @@ define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v6f16_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3043,7 +3043,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4f32(ptr addrspace(1) inreg %sb ; GFX12-LABEL: global_load_saddr_v4f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3067,7 +3067,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v4f32_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 -; 
GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3092,7 +3092,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4i32(ptr addrspace(1) inreg %sb ; GFX12-LABEL: global_load_saddr_v4i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3117,7 +3117,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v4i32_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3143,7 +3143,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2i64(ptr addrspace(1) inreg %sb ; GFX12-LABEL: global_load_saddr_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3168,7 +3168,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v2i64_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr 
addrspace(1) %sbase, i64 %zext.offset @@ -3194,7 +3194,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_i128(ptr addrspace(1) inreg %sba ; GFX12-LABEL: global_load_saddr_i128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3219,7 +3219,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_i128_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3245,7 +3245,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2p1(ptr addrspace(1) inreg %sba ; GFX12-LABEL: global_load_saddr_v2p1: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3271,7 +3271,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v2p1_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3298,7 +3298,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3(ptr addrspace(1) inreg %sba ; GFX12-LABEL: global_load_saddr_v4p3: ; GFX12: ; %bb.0: 
; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3324,7 +3324,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(ptr addrspace(1) ; GFX12-LABEL: global_load_saddr_v4p3_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3355,7 +3355,7 @@ define amdgpu_ps float @global_sextload_saddr_i8(ptr addrspace(1) inreg %sbase, ; GFX12-LABEL: global_sextload_saddr_i8: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_i8 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3381,7 +3381,7 @@ define amdgpu_ps float @global_sextload_saddr_i8_immneg128(ptr addrspace(1) inre ; GFX12-LABEL: global_sextload_saddr_i8_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3408,7 +3408,7 @@ define amdgpu_ps float @global_sextload_saddr_i16(ptr addrspace(1) inreg %sbase, ; GFX12-LABEL: global_sextload_saddr_i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_i16 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3434,7 +3434,7 @@ define amdgpu_ps float @global_sextload_saddr_i16_immneg128(ptr addrspace(1) inr ; GFX12-LABEL: global_sextload_saddr_i16_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_i16 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3461,7 +3461,7 @@ define amdgpu_ps float @global_zextload_saddr_i8(ptr addrspace(1) inreg %sbase, ; GFX12-LABEL: global_zextload_saddr_i8: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3487,7 +3487,7 @@ define amdgpu_ps float @global_zextload_saddr_i8_immneg128(ptr addrspace(1) inre ; GFX12-LABEL: global_zextload_saddr_i8_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3514,7 +3514,7 @@ define amdgpu_ps float @global_zextload_saddr_i16(ptr addrspace(1) inreg %sbase, ; GFX12-LABEL: global_zextload_saddr_i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3540,7 +3540,7 @@ define amdgpu_ps float @global_zextload_saddr_i16_immneg128(ptr addrspace(1) inr ; GFX12-LABEL: 
global_zextload_saddr_i16_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3582,7 +3582,7 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sba ; GFX12-LABEL: atomic_global_load_saddr_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -3619,7 +3619,7 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1) ; GFX12-LABEL: atomic_global_load_saddr_i32_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -3657,7 +3657,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inre ; GFX12-LABEL: atomic_global_load_saddr_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -3694,7 +3694,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspa ; GFX12-LABEL: atomic_global_load_saddr_i64_immneg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; 
return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -3725,13 +3725,13 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace( ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_undef_hi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3757,13 +3757,13 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3795,14 +3795,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1 ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part 
epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -3834,14 +3834,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr a ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -3871,14 +3871,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(ptr addrspace(1) ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -3907,14 +3907,14 @@ define amdgpu_ps <2 x half> 
@global_load_saddr_i16_d16lo_reg_hi_immneg128(ptr ad ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -3944,14 +3944,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(ptr addrs ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_u8 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -3981,14 +3981,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: 
global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -4019,14 +4019,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrs ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 @@ -4058,14 +4058,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 @@ -4100,13 
+4100,13 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace( ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_undef_hi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -4133,13 +4133,13 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -4172,14 +4172,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1 ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 
v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -4211,14 +4211,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -4248,14 +4248,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(ptr addrspace(1) ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 @@ -4286,14 +4286,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(ptr ad ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: ; GFX12-SDAG: ; %bb.0: ; 
GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 @@ -4325,14 +4325,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(ptr addrs ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 @@ -4364,14 +4364,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] 
offset:-128 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 @@ -4404,14 +4404,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrs ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -4444,14 +4444,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -4494,7 +4494,7 @@ define amdgpu_ps float 
@global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr add ; GFX12-NEXT: v_or_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.idx = zext i32 %idx to i64 %or = or i64 %zext.idx, 16 @@ -4527,7 +4527,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a ; GFX12-NEXT: v_or_b32_e32 v0, 0x1040, v0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %zext.idx = zext i32 %idx to i64 %or = or i64 %zext.idx, 4160 @@ -4605,8 +4605,8 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 -; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 ; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2 @@ -4626,8 +4626,8 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc -; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2 ; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB132_1 ; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2 @@ -4721,10 +4721,10 @@ define 
amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 -; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 ; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2 @@ -4744,10 +4744,10 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc -; GFX12-GISEL-NEXT: global_load_b32 v6, v[4:5], off th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_b32 v6, v[4:5], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2 ; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB133_1 ; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll index 2a4a3ad2e3bcaa..1102f9d0f1a5fd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll @@ -26,7 +26,7 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s ; GFX12-LABEL: 
global_store_saddr_i8_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v0, v2, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -59,7 +59,7 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(ptr addrspace ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v0, v2, s[2:3] offset:2047 ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -93,7 +93,7 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v0, v2, s[2:3] offset:-2048 ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -151,7 +151,7 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: ds_load_b64 v[2:3], v2 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] @@ -204,7 +204,7 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: ds_load_b64 v[2:3], v2 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] offset:-120 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll 
b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 325dae172d5236..0413e21b9c215a 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -46,11 +46,11 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -117,11 +117,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -193,13 +193,13 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], 
s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -273,14 +273,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -340,11 +340,11 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -410,11 +410,11 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] 
th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -483,13 +483,13 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -560,14 +560,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -622,11 +622,11 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -693,11 +693,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -769,13 +769,13 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -849,14 +849,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -916,11 +916,11 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -986,11 +986,11 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -1059,13 +1059,13 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -1136,14 +1136,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -1198,11 +1198,11 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -1269,11 +1269,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -1345,13 +1345,13 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -1425,14 +1425,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -1492,11 +1492,11 @@ 
define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -1562,11 +1562,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -1635,13 +1635,13 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -1712,14 +1712,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) 
%out, ptr ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -1768,11 +1768,11 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -1836,11 +1836,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -1906,13 +1906,13 @@ define amdgpu_kernel void 
@atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -1983,14 +1983,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -2044,11 +2044,11 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2111,11 +2111,11 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -2178,13 +2178,13 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2252,14 +2252,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] 
th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -2308,11 +2308,11 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2376,11 +2376,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -2446,13 +2446,13 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_max_u64 v2, 
v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2523,14 +2523,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -2584,11 +2584,11 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2651,11 +2651,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] 
th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -2718,13 +2718,13 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2792,14 +2792,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -2848,11 +2848,11 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -2916,11 +2916,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -2986,13 +2986,13 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -3063,14 +3063,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -3124,11 +3124,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -3191,11 +3191,11 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -3258,13 +3258,13 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -3332,14 +3332,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -3388,11 +3388,11 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -3456,11 +3456,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -3526,13 +3526,13 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -3603,14 +3603,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -3664,11 +3664,11 @@ define 
amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -3731,11 +3731,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -3798,13 +3798,13 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: @@ -3872,14 +3872,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr 
; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -3934,11 +3934,11 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4005,11 +4005,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -4081,13 +4081,13 @@ define amdgpu_kernel void 
@atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4161,14 +4161,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -4228,11 +4228,11 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4298,11 +4298,11 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -4371,13 +4371,13 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4448,14 +4448,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] 
th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -4510,11 +4510,11 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4565,11 +4565,11 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: @@ -4620,11 +4620,11 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: @@ -4691,11 
+4691,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -4767,13 +4767,13 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4847,14 +4847,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; 
GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -4914,11 +4914,11 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -4984,11 +4984,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -5057,13 +5057,13 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5134,14 +5134,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -5196,11 +5196,11 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5267,11 +5267,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt 
vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -5343,13 +5343,13 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5423,14 +5423,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -5490,11 +5490,11 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5560,11 +5560,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -5633,13 +5633,13 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5710,14 +5710,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -5787,12 +5787,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5860,12 +5860,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -5933,12 +5933,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -6012,7 +6012,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 @@ -6020,7 +6020,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6105,7 +6105,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 @@ -6113,7 +6113,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: 
global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] ; GFX12-NEXT: s_nop 0 @@ -6185,12 +6185,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6257,12 +6257,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -6333,7 +6333,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 @@ -6341,7 +6341,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -6423,7 +6423,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 @@ -6431,7 +6431,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] ; GFX12-NEXT: s_nop 0 @@ -6495,9 +6495,9 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -6562,9 +6562,9 @@ define amdgpu_kernel void 
@atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -6625,9 +6625,9 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -6703,12 +6703,12 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -6784,12 +6784,12 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -6866,12 +6866,12 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -6926,7 +6926,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 ; GFX12-NEXT: s_nop 0 @@ -6977,7 +6977,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -7042,7 +7042,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 
%in, ptr addrspace ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -7110,7 +7110,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -7179,7 +7179,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -7237,11 +7237,11 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -7308,11 +7308,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; 
GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -7384,13 +7384,13 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -7442,11 +7442,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: @@ -7513,11 +7513,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], 
s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 @@ -7589,13 +7589,13 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] ; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 -; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll index 6a6c5b33e0dd8f..762722ec3d9230 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) ; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN @@ -18,7 +18,7 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_endpgm @@ -34,10 +34,10 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a ; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 ; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -49,10 +49,10 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a ; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; 
GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll index c76f22d7763923..e96d6875558985 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll @@ -20,7 +20,7 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX12-LABEL: load_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX12-LABEL: load_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -69,7 +69,7 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX12-LABEL: load_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -95,7 +95,7 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords ; GFX12-LABEL: load_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -121,7 +121,7 @@ define amdgpu_ps 
<4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX12-LABEL: load_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -146,7 +146,7 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX12-LABEL: load_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -172,7 +172,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_2dmsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -198,7 +198,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-LABEL: load_2darraymsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -225,7 +225,7 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_mip_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part 
epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -250,7 +250,7 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_mip_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -276,7 +276,7 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_mip_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -303,7 +303,7 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-LABEL: load_mip_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -330,7 +330,7 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-LABEL: load_mip_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -356,7 +356,7 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-LABEL: load_mip_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf 
dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -830,7 +830,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-LABEL: getresinfo_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -854,7 +854,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-LABEL: getresinfo_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -878,7 +878,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-LABEL: getresinfo_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -902,7 +902,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX12-LABEL: getresinfo_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -926,7 +926,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX12-LABEL: 
getresinfo_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -950,7 +950,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX12-LABEL: getresinfo_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -974,7 +974,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-LABEL: getresinfo_2dmsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -998,7 +998,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x ; GFX12-LABEL: getresinfo_2darraymsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -1022,7 +1022,7 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) { ; GFX12-LABEL: load_1d_V1: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, 
i32 0 @@ -1046,7 +1046,7 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord ; GFX12-LABEL: load_1d_V2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1130,7 +1130,7 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_1d_glc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1154,7 +1154,7 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_1d_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1178,7 +1178,7 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX12-LABEL: load_1d_glc_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_LU a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll index 493653b6583089..276b506fb4a230 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll @@ -26,7 +26,7 @@ 
define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX12-LABEL: load_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -56,7 +56,7 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX12-LABEL: load_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x41,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -87,7 +87,7 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX12-LABEL: load_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x42,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -119,7 +119,7 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords ; GFX12-LABEL: load_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; encoding: [0x43,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] 
+; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -151,7 +151,7 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX12-LABEL: load_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; encoding: [0x44,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -182,7 +182,7 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX12-LABEL: load_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; encoding: [0x45,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -214,7 +214,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_2dmsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16 ; encoding: [0x46,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -246,7 +246,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-LABEL: load_2darraymsaa: ; GFX12: ; 
%bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 ; encoding: [0x47,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -279,7 +279,7 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_mip_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -310,7 +310,7 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_mip_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x41,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -342,7 +342,7 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_mip_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x42,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; 
GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -375,7 +375,7 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-LABEL: load_mip_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; encoding: [0x43,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -408,7 +408,7 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-LABEL: load_mip_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; encoding: [0x44,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -440,7 +440,7 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-LABEL: load_mip_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; encoding: [0x45,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -920,7 +920,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-LABEL: getresinfo_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: 
image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -950,7 +950,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-LABEL: getresinfo_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x41,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -980,7 +980,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-LABEL: getresinfo_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x42,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -1010,7 +1010,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX12-LABEL: getresinfo_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; encoding: [0x43,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader 
part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -1040,7 +1040,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX12-LABEL: getresinfo_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; encoding: [0x44,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -1070,7 +1070,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX12-LABEL: getresinfo_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; encoding: [0x45,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -1100,7 +1100,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-LABEL: getresinfo_2dmsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16 ; encoding: [0x46,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -1130,7 +1130,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x ; GFX12-LABEL: getresinfo_2darraymsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], 
v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 ; encoding: [0x47,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -1160,7 +1160,7 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) { ; GFX12-LABEL: load_1d_V1: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x00,0x00,0xd2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1190,7 +1190,7 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord ; GFX12-LABEL: load_1d_V2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x00,0x40,0xd2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1280,7 +1280,7 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_1d_glc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT a16 ; encoding: [0x40,0x00,0xc0,0xd3,0x00,0x00,0x10,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement 
<2 x i16> %coords, i32 0 @@ -1310,7 +1310,7 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-LABEL: load_1d_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT a16 ; encoding: [0x40,0x00,0xc0,0xd3,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1340,7 +1340,7 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX12-LABEL: load_1d_glc_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_LU a16 ; encoding: [0x40,0x00,0xc0,0xd3,0x00,0x00,0x30,0x00,0x00,0x00,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index afa9696b17110b..5b5586f86f0d73 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -41,7 +41,7 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-LABEL: load_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -152,7 +152,7 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX12-NEXT: 
v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: v_mov_b32_e32 v4, v10 ; GFX12-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v6, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -262,7 +262,7 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -307,7 +307,7 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { ; GFX12-LABEL: load_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -423,7 +423,7 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: v_mov_b32_e32 v4, v11 ; GFX12-NEXT: image_load v[0:4], [v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -468,7 +468,7 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 ; GFX12-LABEL: load_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to 
shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -589,7 +589,7 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; GFX12-NEXT: v_mov_b32_e32 v4, v12 ; GFX12-NEXT: image_load v[0:4], [v7, v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v8, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -634,7 +634,7 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i ; GFX12-LABEL: load_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -749,7 +749,7 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:4], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; GFX12-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -794,7 +794,7 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %s ; GFX12-LABEL: load_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> 
%rsrc, i32 0, i32 0) @@ -910,7 +910,7 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: v_mov_b32_e32 v4, v11 ; GFX12-NEXT: image_load v[0:4], [v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -955,7 +955,7 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t ; GFX12-LABEL: load_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -1070,7 +1070,7 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:4], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY ; GFX12-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1115,7 +1115,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX12-LABEL: load_2dmsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -1236,7 +1236,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> 
inreg %rsrc, ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; GFX12-NEXT: v_mov_b32_e32 v4, v12 ; GFX12-NEXT: image_load v[0:4], [v7, v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v8, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1281,7 +1281,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3 ; GFX12-LABEL: load_2darraymsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -1407,7 +1407,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; GFX12-NEXT: v_mov_b32_e32 v4, v13 ; GFX12-NEXT: image_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v9, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1452,7 +1452,7 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mi ; GFX12-LABEL: load_mip_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -1562,7 +1562,7 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX12: ; 
%bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:4], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1607,7 +1607,7 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX12-LABEL: load_mip_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -1728,7 +1728,7 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; GFX12-NEXT: v_mov_b32_e32 v4, v12 ; GFX12-NEXT: image_load_mip v[0:4], [v7, v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v8, v4, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1789,7 +1789,7 @@ define amdgpu_ps float @load_1d_V2_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1849,7 +1849,7 @@ define amdgpu_ps float @load_1d_V1_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1909,7 +1909,7 @@ define amdgpu_ps float @load_mip_2d_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s, i3 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: image_load_mip v[3:4], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1969,7 +1969,7 @@ define amdgpu_ps float @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: image_load_mip v[3:4], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -2029,7 +2029,7 @@ define amdgpu_ps float @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: image_load_mip v[3:4], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -2089,7 +2089,7 @@ define amdgpu_ps float @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: image_load_mip v[3:4], [v0, v1, v2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -2194,7 +2194,7 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr a ; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 
; GFX12-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v5, v3, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -2292,7 +2292,7 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, ptr a ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: v_mov_b32_e32 v2, v6 ; GFX12-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v4, v2, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -2381,7 +2381,7 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, ptr a ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 ; GFX12-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v3, v1, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -2470,7 +2470,7 @@ define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, ptr a ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 ; GFX12-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v3, v1, s[8:9] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -2516,7 +2516,7 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX12-LABEL: load_mip_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2557,7 +2557,7 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 % ; GFX12-LABEL: load_mip_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2598,7 +2598,7 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i3 ; GFX12-LABEL: load_mip_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2639,7 +2639,7 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i3 ; GFX12-LABEL: load_mip_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load_mip v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3296,7 +3296,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) { ; GFX12-LABEL: getresinfo_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3337,7 +3337,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) { ; GFX12-LABEL: getresinfo_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3378,7 +3378,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) { ; GFX12-LABEL: getresinfo_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3419,7 +3419,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) { ; GFX12-LABEL: getresinfo_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3460,7 +3460,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip ; GFX12-LABEL: getresinfo_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog 
main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3501,7 +3501,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip ; GFX12-LABEL: getresinfo_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3542,7 +3542,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip) ; GFX12-LABEL: getresinfo_2dmsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3583,7 +3583,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32 ; GFX12-LABEL: getresinfo_2darraymsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3624,7 +3624,7 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-LABEL: load_1d_V1: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call float 
@llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3665,7 +3665,7 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-LABEL: load_1d_V2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3794,7 +3794,7 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-LABEL: load_1d_glc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) @@ -3835,7 +3835,7 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-LABEL: load_1d_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -3876,7 +3876,7 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) { ; GFX12-LABEL: load_1d_glc_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_LU -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) @@ -4049,7 +4049,7 @@ define 
amdgpu_ps <3 x float> @getresinfo_dmask7(<8 x i32> inreg %rsrc, <4 x floa ; GFX12-LABEL: getresinfo_dmask7: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <3 x float> @llvm.amdgcn.image.getresinfo.1d.v3f32.i32(i32 7, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -4090,7 +4090,7 @@ define amdgpu_ps <2 x float> @getresinfo_dmask3(<8 x i32> inreg %rsrc, <4 x floa ; GFX12-LABEL: getresinfo_dmask3: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <2 x float> @llvm.amdgcn.image.getresinfo.1d.v2f32.i32(i32 3, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -4131,7 +4131,7 @@ define amdgpu_ps float @getresinfo_dmask1(<8 x i32> inreg %rsrc, <4 x float> %vd ; GFX12-LABEL: getresinfo_dmask1: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_get_resinfo v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -4224,7 +4224,7 @@ define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %a ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4306,9 +4306,9 @@ define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, 
ptr addrspace(3) % ; GFX12-NEXT: image_load v1, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: ds_store_2addr_b32 v0, v2, v2 offset1:4 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: ; return to shader part epilog store float 0.000000e+00, ptr addrspace(3) %lds %c0 = extractelement <2 x i32> %c, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll index bee9cafffa2f8c..3210565c96abb0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -33,7 +33,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -69,7 +69,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32 1, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -105,7 +105,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> 
inreg %rsrc, <4 x i32> i ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32 1, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -141,7 +141,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_gather4_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -177,7 +177,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32 1, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -215,7 +215,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; 
GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -251,7 +251,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_gather4_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -287,7 +287,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_gather4_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -325,7 +325,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, half %clamp, <8 x i32> 
%rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -364,7 +364,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -391,7 +391,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 1, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -420,7 +420,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -447,7 +447,7 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: 
image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -474,7 +474,7 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: image_gather4_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll index d0ee235bcf7b34..3a0762f1c4f77b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll @@ -1,14 +1,15 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10 %s +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10,PRE-GFX12 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10,PRE-GFX12 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck 
--check-prefixes=GCN,GFX10,PRE-GFX12 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10,PRE-GFX12 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX12 %s ; GCN-LABEL: {{^}}getlod_1d: ; PRE-GFX10: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf{{$}} ; GFX10: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX12: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GCN: s_waitcnt vmcnt(0) +; PRE-GFX12: s_waitcnt vmcnt(0) +; GFX12: s_wait_samplecnt 0x0 define amdgpu_ps <4 x float> @getlod_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -19,7 +20,8 @@ main_body: ; PRE-GFX10: image_get_lod v[0:1], v[0:1], s[0:7], s[8:11] dmask:0x3{{$}} ; GFX10: image_get_lod v[0:1], v[0:1], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_2D ; GFX12: image_get_lod v[0:1], [v0, v1], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_2D -; GCN: s_waitcnt vmcnt(0) +; PRE-GFX12: s_waitcnt vmcnt(0) +; GFX12: s_wait_samplecnt 0x0 define amdgpu_ps <2 x float> @getlod_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { main_body: %r = call <2 x float> @llvm.amdgcn.image.getlod.2d.v2f32.f32(i32 3, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll index 863dd357de43d4..9b5ae45688c210 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll @@ -12,7 +12,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX12-LABEL: load_2dmsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:3], [v0, v1, v2], s[0:7] 
dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -32,7 +32,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:4], [v0, v1, v2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x00,0x01,0x02,0x00] ; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -53,7 +53,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3 ; GFX12-LABEL: load_2darraymsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; encoding: [0x07,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -73,7 +73,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:4], [v0, v1, 
v2, v3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03] ; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -94,7 +94,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_glc(<8 x i32> inreg %rsrc, i32 %s, i32 ; GFX12-LABEL: load_2dmsaa_glc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_NT ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x10,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 1) @@ -111,7 +111,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_slc(<8 x i32> inreg %rsrc, i32 %s, i32 ; GFX12-LABEL: load_2dmsaa_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_HT ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x20,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 2) @@ -128,7 +128,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_glc_slc(<8 x i32> inreg %rsrc, i32 %s, ; GFX12-LABEL: 
load_2dmsaa_glc_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_LU ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x30,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 3) @@ -145,7 +145,7 @@ define amdgpu_ps <4 x half> @load_2dmsaa_d16(<8 x i32> inreg %rsrc, i32 %s, i32 ; GFX12-LABEL: load_2dmsaa_d16: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:1], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm d16 ; encoding: [0x26,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x half> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -165,7 +165,7 @@ define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addr ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:2], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00] ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -186,7 +186,7 @@ define amdgpu_ps <4 x half> @load_2darraymsaa_d16(<8 x 
i32> inreg %rsrc, i32 %s, ; GFX12-LABEL: load_2darraymsaa_d16: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:1], [v0, v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm d16 ; encoding: [0x27,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x half> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -206,7 +206,7 @@ define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_msaa_load v[0:2], [v0, v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03] ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -229,7 +229,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x46,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i16(i32 1, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -250,7 +250,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_a16(<8 x i32> inreg %rsrc, i16 %s ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x47,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i16(i32 4, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll index b0c2f7ac818451..4dfa3aa3aa7c53 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -38,7 +38,7 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -84,7 +84,7 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 
exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -130,7 +130,7 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -176,7 +176,7 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -222,7 +222,7 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, 
half %s, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -268,7 +268,7 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -309,7 +309,7 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -355,7 +355,7 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -401,7 +401,7 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: 
image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -447,7 +447,7 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -493,7 +493,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -541,7 +541,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -582,7 +582,7 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -669,7 +669,7 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -715,7 +715,7 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: v_perm_b32 
v2, v3, v2, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -761,7 +761,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -809,7 +809,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -855,7 +855,7 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; 
GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -904,7 +904,7 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -933,7 +933,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-LABEL: sample_d_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -975,7 +975,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, 
half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1025,7 +1025,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: v_perm_b32 v9, v7, v6, 0x5040100 ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v[8:10]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1054,7 +1054,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-LABEL: sample_c_d_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1098,7 +1098,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1132,7 +1132,7 @@ define 
amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1174,7 +1174,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1208,7 +1208,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1252,7 +1252,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: v_perm_b32 
v6, v6, v5, 0x5040100 ; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1286,7 +1286,7 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1320,7 +1320,7 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1354,7 +1354,7 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: image_sample_c_l v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: 
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1390,7 +1390,7 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1419,7 +1419,7 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-LABEL: sample_lz_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1453,7 +1453,7 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1482,7 +1482,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-LABEL: sample_c_lz_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_lz 
v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1516,7 +1516,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: image_sample_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1564,7 +1564,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_perm_b32 v6, v5, v4, 0x5040100 ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1612,7 +1612,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX12-NEXT: v_perm_b32 v6, v5, v4, 0x5040100 ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; 
return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll index 950db78d2d502d..d90c193514eb0e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -49,7 +49,7 @@ define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v0, [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -144,7 +144,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_sample v[0:1], [v3, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: global_store_b32 v4, v1, s[12:13] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -185,7 +185,7 @@ define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX12-LABEL: image_sample_c_d_1d_v2f16: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d v0, [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader 
part epilog main_body: %tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -251,7 +251,7 @@ define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsr ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) @@ -310,7 +310,7 @@ define amdgpu_ps <2 x float> @image_sample_b_2d_v3f16(<8 x i32> inreg %rsrc, <4 ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b v[0:1], [v0, v1, v2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %tex = call <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -404,7 +404,7 @@ define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc, ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b v[0:2], [v5, v4, v3], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, 
i32 0) @@ -467,7 +467,7 @@ define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b v[0:1], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -561,7 +561,7 @@ define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b v[0:2], [v5, v4, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll index 86bf6f03bcfd1c..d0a1597c856c8b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -39,7 +39,7 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x 
i32> %samp, i1 0, i32 0, i32 0) @@ -139,7 +139,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: v_mov_b32_e32 v4, v10 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: global_store_b32 v6, v4, s[12:13] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -196,7 +196,7 @@ define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg % ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -255,7 +255,7 @@ define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg % ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -314,7 +314,7 @@ define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg % ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} 
@llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -373,7 +373,7 @@ define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg % ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -446,7 +446,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -521,7 +521,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -589,7 +589,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg ; GFX12-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], v4, 
s[0:7], s[8:11] dmask:0xd dim:SQ_RSRC_IMG_1D tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -699,7 +699,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: v_mov_b32_e32 v4, v10 ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: global_store_b32 v6, v4, s[12:13] ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -744,7 +744,7 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -785,7 +785,7 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -826,7 +826,7 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: s_wqm_b32 
exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -867,7 +867,7 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -908,7 +908,7 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -949,7 +949,7 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 
x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -990,7 +990,7 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1031,7 +1031,7 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1072,7 +1072,7 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1113,7 +1113,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; 
GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1154,7 +1154,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1195,7 +1195,7 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1236,7 +1236,7 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog 
main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1277,7 +1277,7 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1318,7 +1318,7 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_b v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1359,7 +1359,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1400,7 +1400,7 @@ define amdgpu_ps <4 x float> 
@sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_b_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1441,7 +1441,7 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1482,7 +1482,7 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1511,7 +1511,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-LABEL: sample_d_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d v[0:3], [v0, v1, v2], s[0:7], 
s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1540,7 +1540,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-LABEL: sample_d_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d v[0:3], [v0, v1, v2, v[3:5]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1569,7 +1569,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-LABEL: sample_c_d_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1598,7 +1598,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-LABEL: sample_c_d_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d v[0:3], [v0, v1, v2, v[3:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, 
float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1627,7 +1627,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-LABEL: sample_d_cl_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1656,7 +1656,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-LABEL: sample_d_cl_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d_cl v[0:3], [v0, v1, v2, v[3:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1685,7 +1685,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-LABEL: sample_c_d_cl_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1714,7 +1714,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, 
<4 x i32> ; GFX12-LABEL: sample_c_d_cl_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v2, v[3:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1743,7 +1743,7 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-LABEL: sample_l_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_l v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1772,7 +1772,7 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-LABEL: sample_l_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_l v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1801,7 +1801,7 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-LABEL: sample_c_l_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_l v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog 
main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1830,7 +1830,7 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-LABEL: sample_c_l_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_l v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1859,7 +1859,7 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-LABEL: sample_lz_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1888,7 +1888,7 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-LABEL: sample_lz_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1917,7 +1917,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-LABEL: sample_c_lz_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_lz v[0:3], [v0, v1], 
s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1946,7 +1946,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-LABEL: sample_c_lz_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_lz v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1975,7 +1975,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-LABEL: sample_c_d_o_2darray_V1: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v[3:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2039,7 +2039,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x ; GFX12-NEXT: v_mov_b32_e32 v12, v11 ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: image_sample_c_d_o v[0:1], [v10, v9, v2, v[3:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: global_store_b32 v11, v1, s[12:13] ; GFX12-NEXT: ; return 
to shader part epilog main_body: @@ -2072,7 +2072,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX12-LABEL: sample_c_d_o_2darray_V2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v[3:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2132,7 +2132,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, ; GFX12-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v10, v1 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: image_sample_c_d_o v[0:2], [v11, v10, v9, v[3:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -2181,7 +2181,7 @@ define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 1, i32 0, i32 0) @@ -2222,7 +2222,7 @@ 
define amdgpu_ps <4 x float> @sample_1d_glc(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 1) @@ -2263,7 +2263,7 @@ define amdgpu_ps <4 x float> @sample_1d_slc(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 2) @@ -2304,7 +2304,7 @@ define amdgpu_ps <4 x float> @sample_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_LU -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 3) @@ -2345,7 +2345,7 @@ define amdgpu_ps float @adjust_writemask_sample_0(<8 x i32> inreg %rsrc, <4 x i3 ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part 
epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2387,7 +2387,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_01(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2429,7 +2429,7 @@ define amdgpu_ps <3 x float> @adjust_writemask_sample_012(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2471,7 +2471,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_12(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2513,7 +2513,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_03(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] 
dmask:0x9 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2555,7 +2555,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_13(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2597,7 +2597,7 @@ define amdgpu_ps <3 x float> @adjust_writemask_sample_123(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2660,7 +2660,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_123_to_12(<8 x i32> inreg ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 14, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2702,7 +2702,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_013_to_13(<8 x i32> inreg ; 
GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX12-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 11, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll index 134c736c6472f9..2ec7acb9934075 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -19,7 +19,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-LABEL: sample_d_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x00] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -48,7 +48,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x05] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; 
encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -79,7 +79,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x03,0x05] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -102,7 +102,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-LABEL: sample_c_d_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x80,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> 
%samp, i1 0, i32 0, i32 0) @@ -131,7 +131,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x80,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x03,0x05] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -154,7 +154,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-LABEL: sample_d_cl_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0xd7,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -183,7 +183,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: 
[0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0xc0,0xd7,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x05] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -206,7 +206,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-LABEL: sample_c_d_cl_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x00,0xd5,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -237,7 +237,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[5:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x00,0xd5,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x03,0x05] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; 
GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -270,7 +270,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x00,0x0f,0xe5,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x05] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -303,7 +303,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX12-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x00,0x8f,0xe5,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x05] -; GFX12-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; 
GFX12-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll index e12dcc8795c08a..ba9e16f4320d52 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -19,7 +19,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-LABEL: sample_d_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -48,7 +48,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -79,7 +79,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: image_sample_d_g16 
v[0:3], [v0, v2, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -102,7 +102,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-LABEL: sample_c_d_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -131,7 +131,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -154,7 +154,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-LABEL: sample_d_cl_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; 
GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -183,7 +183,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -206,7 +206,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-LABEL: sample_c_d_cl_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -237,7 +237,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX12-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[5:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, 
float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -270,7 +270,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -303,7 +303,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX12-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll index 195c5dabb4d461..78b299d6dc1d77 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll @@ -12,12 +12,16 @@ ; GFX12: ds_direct_load v{{[0-9]+}} ; GCN: s_mov_b32 m0 ; GFX11: lds_direct_load v{{[0-9]+}} -; GCN: s_waitcnt expcnt(2) +; GFX11: s_waitcnt expcnt(2) +; GFX12: 
ds_direct_load v{{[0-9]+}} +; GFX12: s_wait_expcnt 0x2 ; GCN: v_add_f32 ; GCN: buffer_store_b32 -; GCN: s_waitcnt expcnt(1) +; GFX11: s_waitcnt expcnt(1) +; GFX12: s_wait_expcnt 0x1 ; GCN: buffer_store_b32 -; GCN: s_waitcnt expcnt(0) +; GFX11: s_waitcnt expcnt(0) +; GFX12: s_wait_expcnt 0x0 ; GCN: buffer_store_b32 ; GCN: buffer_store_b32 ; GCN: buffer_store_b32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll index 1ab753d75fe031..3d3db711badee5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll @@ -15,16 +15,20 @@ ; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr0.z ; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr0.w ; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr1.x -; GCN: s_waitcnt expcnt(4) +; GFX11-DAG: s_waitcnt expcnt(4) ; GCN: v_add_f32 ; GCN: buffer_store_b32 -; GCN: s_waitcnt expcnt(3) +; GFX11-DAG: s_waitcnt expcnt(3) +; GFX12-DAG: s_wait_expcnt 0x3 ; GCN: buffer_store_b32 -; GCN: s_waitcnt expcnt(2) +; GFX11-DAG: s_waitcnt expcnt(2) +; GFX12-DAG: s_wait_expcnt 0x2 ; GCN: buffer_store_b32 -; GCN: s_waitcnt expcnt(1) +; GFX11-DAG: s_waitcnt expcnt(1) +; GFX12-DAG: s_wait_expcnt 0x1 ; GCN: buffer_store_b32 -; GCN: s_waitcnt expcnt(0) +; GFX11-DAG: s_waitcnt expcnt(0) +; GFX12-DAG: s_wait_expcnt 0x0 ; GCN: buffer_store_b32 ; GCN: buffer_store_b32 define amdgpu_ps void @lds_param_load(ptr addrspace(8) inreg %buf, i32 inreg %arg) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index d630ba946dca34..ac85313823d9c3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -43,7 +43,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; 
GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 @@ -86,7 +86,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; GFX12-LABEL: v_permlane16_b32_vii: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 @@ -132,7 +132,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0 ; GFX12-LABEL: v_permlane16_b32_vll: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -208,7 +208,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 @@ -226,7 +226,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: 
v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -295,7 +295,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 @@ -310,7 +310,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 @@ -381,7 +381,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0 ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -399,7 +399,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -444,7 +444,7 @@ define amdgpu_kernel void 
@v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %s ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] @@ -489,7 +489,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %s ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] @@ -534,7 +534,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] @@ -579,7 +579,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 @@ -622,7 +622,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src ; GFX12-LABEL: v_permlanex16_b32_vii: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 @@ -668,7 +668,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src ; GFX12-LABEL: v_permlanex16_b32_vll: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -744,7 +744,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 @@ -762,7 +762,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -831,7 +831,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v0, 0 @@ -846,7 +846,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -935,7 +935,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -980,7 +980,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] @@ -1025,7 +1025,7 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vss_bc(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] @@ -1070,7 +1070,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] @@ -1114,7 +1114,7 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1230,7 +1230,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr ; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] @@ -1244,7 +1244,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr ; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -1289,7 +1289,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1377,7 +1377,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1421,7 +1421,7 @@ define amdgpu_kernel void 
@v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1464,7 +1464,7 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1537,7 +1537,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s ; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] @@ -1551,7 +1551,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s ; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1640,7 +1640,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1684,7 +1684,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll index f865418befed7b..7c439ab5afed39 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 @@ -24,7 +24,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 @@ -43,7 +43,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 @@ -74,7 +74,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 @@ -86,7 +86,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 
:: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 @@ -104,7 +104,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 @@ -116,7 +116,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] @@ -148,7 +148,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 
op_sel:[1,0] @@ -167,7 +167,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] @@ -179,7 +179,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] @@ -210,7 +210,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] @@ -229,7 +229,7 @@ define amdgpu_kernel void 
@v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 @@ -241,7 +241,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 @@ -260,7 +260,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 @@ -272,7 +272,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 @@ -291,7 +291,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 
0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 @@ -321,7 +321,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 @@ -333,7 +333,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 @@ -353,7 +353,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] @@ -365,7 +365,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] @@ -384,7 +384,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] @@ -396,7 +396,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] @@ -415,7 +415,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] @@ -427,7 +427,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] @@ -447,7 +447,7 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 @@ -461,7 +461,7 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 @@ -482,7 +482,7 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: 
v_permlane16_var_b32 v0, v0, v1 @@ -496,7 +496,7 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 @@ -519,7 +519,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2 @@ -533,7 +533,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2 @@ -554,7 +554,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] @@ -568,7 +568,7 @@ define amdgpu_kernel void 
@v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] @@ -590,7 +590,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] @@ -604,7 +604,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] @@ -626,7 +626,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] @@ -640,7 +640,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: 
s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] @@ -662,7 +662,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 @@ -676,7 +676,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 @@ -697,7 +697,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 @@ -711,7 +711,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 
-; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 @@ -734,7 +734,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2 @@ -748,7 +748,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2 @@ -769,7 +769,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] @@ -783,7 +783,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] @@ -805,7 +805,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] @@ -819,7 +819,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] @@ -841,7 +841,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] @@ -855,7 +855,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll index e6261bb5cb1325..df8b4f9be07c40 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -39,7 +39,7 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null ; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[0:3], null th:TH_LOAD_NT ; GFX12-NEXT: buffer_load_b128 v[8:11], off, s[0:3], null th:TH_LOAD_HT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0) @@ -84,7 +84,7 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(<4 x i ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null th:TH_LOAD_NT_RT ; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[0:3], null th:TH_LOAD_RT_NT ; GFX12-NEXT: buffer_load_b128 v[8:11], off, s[0:3], null th:TH_LOAD_NT_HT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 4) @@ -118,7 +118,7 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { ; GFX12-LABEL: buffer_load_immoffs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:40 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0) @@ -151,7 +151,7 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> 
inreg) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_movk_i32 s4, 0x1ffc ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], s4 offset:4 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0) @@ -180,7 +180,7 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { ; GFX12-LABEL: buffer_load_ofs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0) @@ -209,7 +209,7 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { ; GFX12-LABEL: buffer_load_ofs_imm: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen offset:60 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %1, 60 @@ -239,7 +239,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { ; GFX12-LABEL: buffer_load_voffset_large_12bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:4092 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4092, i32 0, i32 0) @@ -271,7 +271,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) { ; GFX12-LABEL: buffer_load_voffset_large_13bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:8188 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader 
part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 8188, i32 0, i32 0) @@ -303,7 +303,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) { ; GFX12-LABEL: buffer_load_voffset_large_16bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:65532 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 65532, i32 0, i32 0) @@ -335,7 +335,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) { ; GFX12-LABEL: buffer_load_voffset_large_23bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:8388604 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 8388604, i32 0, i32 0) @@ -368,7 +368,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen offset:8388604 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 16777212, i32 0, i32 0) @@ -398,7 +398,7 @@ define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) { ; GFX12-LABEL: buffer_load_x1: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0) @@ -427,7 +427,7 @@ 
define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) { ; GFX12-LABEL: buffer_load_x2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0) @@ -453,7 +453,7 @@ define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 % ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_add_nc_u32_e32 v0, -16, v0 ; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %ofs.1 = add i32 %ofs, -16 @@ -487,9 +487,9 @@ define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) ; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: ds_store_2addr_b32 v0, v2, v2 offset1:4 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: ; return to shader part epilog entry: store float 0.0, ptr addrspace(3) %lds @@ -537,9 +537,9 @@ define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: buffer_load_b128 v[1:4], v0, s[0:3], null offen offset:4 ; GFX12-NEXT: buffer_load_b64 v[5:6], v0, s[0:3], null offen offset:28 -; GFX12-NEXT: s_waitcnt vmcnt(1) +; GFX12-NEXT: s_wait_loadcnt 0x1 ; GFX12-NEXT: export mrt0 v1, v2, v3, v4 done -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v5, v6, v0, v0 done ; GFX12-NEXT: s_endpgm main_body: @@ -602,9 +602,9 @@ define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 ; GFX12-SDAG-NEXT: s_clause 0x1 ; 
GFX12-SDAG-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], null offen offset:4 ; GFX12-SDAG-NEXT: buffer_load_b64 v[4:5], v4, s[0:3], null offen offset:28 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1 ; GFX12-SDAG-NEXT: export mrt0 v0, v1, v2, v3 done -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: export mrt0 v4, v5, v0, v0 done ; GFX12-SDAG-NEXT: s_endpgm ; @@ -624,9 +624,9 @@ define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 ; GFX12-GISEL-NEXT: buffer_load_b32 v4, v4, s[0:3], null offen ; GFX12-GISEL-NEXT: buffer_load_b32 v5, v5, s[0:3], null offen ; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2 ; GFX12-GISEL-NEXT: export mrt0 v1, v2, v3, v4 done -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: export mrt0 v5, v0, v0, v0 done ; GFX12-GISEL-NEXT: s_endpgm main_body: @@ -690,9 +690,9 @@ define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc ; GFX12-NEXT: buffer_load_b64 v[1:2], v0, s[0:3], null offen offset:4 ; GFX12-NEXT: buffer_load_b64 v[3:4], v0, s[0:3], null offen offset:12 th:TH_LOAD_NT ; GFX12-NEXT: buffer_load_b64 v[5:6], v0, s[0:3], null offen offset:28 th:TH_LOAD_LU -; GFX12-NEXT: s_waitcnt vmcnt(1) +; GFX12-NEXT: s_wait_loadcnt 0x1 ; GFX12-NEXT: export mrt0 v1, v2, v3, v4 done -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v5, v6, v0, v0 done ; GFX12-NEXT: s_endpgm main_body: @@ -738,7 +738,7 @@ define amdgpu_ps void @buffer_load_x2_offen_merged_and(<4 x i32> inreg %rsrc, i3 ; GFX12-LABEL: buffer_load_x2_offen_merged_and: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen offset:4 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done 
; GFX12-NEXT: s_endpgm main_body: @@ -783,7 +783,7 @@ define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen offset:4 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done ; GFX12-NEXT: s_endpgm main_body: @@ -838,9 +838,9 @@ define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:4 ; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[0:3], null offset:28 -; GFX12-NEXT: s_waitcnt vmcnt(1) +; GFX12-NEXT: s_wait_loadcnt 0x1 ; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v4, v5, v0, v0 done ; GFX12-NEXT: s_endpgm main_body: @@ -880,7 +880,7 @@ define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { ; GFX12-LABEL: buffer_load_x2_offset_merged: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:4 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done ; GFX12-NEXT: s_endpgm main_body: @@ -927,7 +927,7 @@ define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> in ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null ; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[0:3], null th:TH_LOAD_NT ; GFX12-NEXT: buffer_load_b32 v6, off, s[0:3], null th:TH_LOAD_HT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0) @@ -967,7 +967,7 @@ define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) { ; GFX12-LABEL: raw_buffer_load_ubyte: ; GFX12: ; %bb.0: ; 
%main_body ; GFX12-NEXT: buffer_load_u8 v0, off, s[0:3], null -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1002,7 +1002,7 @@ define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) { ; GFX12-LABEL: raw_buffer_load_i16: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_u16 v0, off, s[0:3], null -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1037,7 +1037,7 @@ define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) { ; GFX12-LABEL: raw_buffer_load_sbyte: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_i8 v0, off, s[0:3], null -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1072,7 +1072,7 @@ define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) { ; GFX12-LABEL: raw_buffer_load_sshort: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_i16 v0, off, s[0:3], null -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX12-NEXT: ; return to shader part epilog main_body: @@ -1108,7 +1108,7 @@ define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, ptr addrspace( ; GFX12-LABEL: raw_buffer_load_f16: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_u16 v1, off, s[0:3], null -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ds_store_b16 v0, v1 ; GFX12-NEXT: s_endpgm main_body: @@ -1143,7 +1143,7 @@ define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspac ; GFX12-LABEL: raw_buffer_load_v2f16: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: ds_store_b32 v0, v1 ; GFX12-NEXT: s_endpgm main_body: @@ -1178,7 +1178,7 @@ define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspac ; GFX12-LABEL: raw_buffer_load_v4f16: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b64 v[1:2], off, s[0:3], null -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ds_store_b64 v0, v[1:2] ; GFX12-NEXT: s_endpgm main_body: @@ -1213,7 +1213,7 @@ define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspac ; GFX12-LABEL: raw_buffer_load_v2i16: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ds_store_b32 v0, v1 ; GFX12-NEXT: s_endpgm main_body: @@ -1248,7 +1248,7 @@ define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspac ; GFX12-LABEL: raw_buffer_load_v4i16: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b64 v[1:2], off, s[0:3], null -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ds_store_b64 v0, v[1:2] ; GFX12-NEXT: s_endpgm main_body: @@ -1295,9 +1295,9 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:4 ; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[0:3], null offset:28 -; GFX12-NEXT: s_waitcnt vmcnt(1) +; GFX12-NEXT: s_wait_loadcnt 0x1 ; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v4, v5, v0, v0 done ; GFX12-NEXT: s_endpgm main_body: @@ -1362,9 +1362,9 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:4 scope:SCOPE_SE ; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[0:3], null offset:28 scope:SCOPE_SE 
-; GFX12-NEXT: s_waitcnt vmcnt(1) +; GFX12-NEXT: s_wait_loadcnt 0x1 ; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v4, v5, v0, v0 done ; GFX12-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll index a3f275fb716ed6..4443a1e0dcb01b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll @@ -34,7 +34,7 @@ define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { ; GFX12-PACKED-LABEL: tbuffer_load_d16_x: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: tbuffer_load_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_32_FLOAT] -; GFX12-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX12-PACKED-NEXT: s_wait_loadcnt 0x0 ; GFX12-PACKED-NEXT: ; return to shader part epilog main_body: %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) @@ -73,7 +73,7 @@ define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { ; GFX12-PACKED-LABEL: tbuffer_load_d16_xy: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: tbuffer_load_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_32_FLOAT] -; GFX12-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX12-PACKED-NEXT: s_wait_loadcnt 0x0 ; GFX12-PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-PACKED-NEXT: ; return to shader part epilog main_body: @@ -114,7 +114,7 @@ define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { ; GFX12-PACKED-LABEL: tbuffer_load_d16_xyz: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: tbuffer_load_d16_format_xyz v[0:1], off, s[0:3], null format:[BUF_FMT_32_FLOAT] -; GFX12-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX12-PACKED-NEXT: s_wait_loadcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-PACKED-NEXT: ; return to shader part epilog main_body: @@ 
-155,7 +155,7 @@ define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { ; GFX12-PACKED-LABEL: tbuffer_load_d16_xyzw: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: tbuffer_load_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_32_FLOAT] -; GFX12-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX12-PACKED-NEXT: s_wait_loadcnt 0x0 ; GFX12-PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX12-PACKED-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll index 7ce5571f59f942..9290feb73a1c4d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll @@ -43,7 +43,7 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_l ; GFX12-NEXT: tbuffer_load_format_xyzw v[4:7], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] th:TH_LOAD_NT ; GFX12-NEXT: tbuffer_load_format_xyzw v[8:11], off, s[0:3], null format:[BUF_FMT_32_FLOAT] th:TH_LOAD_HT ; GFX12-NEXT: tbuffer_load_format_xyzw v[12:15], off, s[0:3], null format:[BUF_FMT_32_FLOAT] th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 78, i32 0) @@ -82,7 +82,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { ; GFX12-LABEL: tbuffer_load_immoffs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], off, s[0:3], null format:78 offset:42 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0) @@ -112,7 +112,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { ; 
GFX12-LABEL: buffer_load_voffset_large_12bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:4092 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 4092, i32 0, i32 63, i32 0) @@ -144,7 +144,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_13bit(<4 x i32> inreg) ; GFX12-LABEL: tbuffer_load_voffset_large_13bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:8188 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 8188, i32 0, i32 63, i32 0) @@ -176,7 +176,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_16bit(<4 x i32> inreg) ; GFX12-LABEL: tbuffer_load_voffset_large_16bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:65532 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 65532, i32 0, i32 63, i32 0) @@ -208,7 +208,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_23bit(<4 x i32> inreg) ; GFX12-LABEL: tbuffer_load_voffset_large_23bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:8388604 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 8388604, i32 0, i32 63, i32 0) @@ 
-241,7 +241,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_24bit(<4 x i32> inreg) ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:8388604 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 16777212, i32 0, i32 63, i32 0) @@ -282,7 +282,7 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_l ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], off, s[0:3], s5 format:[BUF_FMT_8_8_8_8_SINT] offset:4095 ; GFX12-NEXT: tbuffer_load_format_xyzw v[4:7], off, s[0:3], s4 format:[BUF_FMT_32_32_32_32_SINT] offset:73 ; GFX12-NEXT: tbuffer_load_format_xyzw v[8:11], off, s[0:3], s4 format:77 offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 4095, i32 61, i32 47, i32 0) %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 73, i32 %soffs, i32 62, i32 0) @@ -318,7 +318,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { ; GFX12-LABEL: tbuffer_load_ofs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:78 offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %voffs, i32 0, i32 78, i32 0) @@ -348,7 +348,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) ; GFX12-LABEL: tbuffer_load_ofs_imm: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:78 offen offset:52 -; GFX12-NEXT: 
s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %voffs, 52 @@ -379,7 +379,7 @@ define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { ; GFX12-LABEL: buffer_load_xy: ; GFX12: ; %bb.0: ; GFX12-NEXT: tbuffer_load_format_xy v[0:1], off, s[0:3], null format:77 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %vdata = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast <2 x i32> %vdata to <2 x float> @@ -408,7 +408,7 @@ define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { ; GFX12-LABEL: buffer_load_x: ; GFX12: ; %bb.0: ; GFX12-NEXT: tbuffer_load_format_x v0, off, s[0:3], null format:77 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %vdata = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast i32 %vdata to float diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index f258b443009aea..59b3776b844814 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -53,7 +53,7 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; GFX12-PACKED-NEXT: s_clause 0x1 ; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 ; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] ; GFX12-PACKED-NEXT: s_nop 0 @@ -113,7 +113,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX12-PACKED-NEXT: 
s_clause 0x1 ; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 ; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] ; GFX12-PACKED-NEXT: s_nop 0 @@ -181,7 +181,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-SDAG-NEXT: s_clause 0x1 ; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s5 @@ -195,7 +195,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 ; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4 ; GFX12-PACKED-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -266,7 +266,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX12-PACKED-NEXT: s_clause 0x1 ; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 818e8eb9463953..ad90a0a81ea956 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -93,7 +93,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT4: ; %bb.0: ; %entry ; VARIANT4-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; VARIANT4-NEXT: s_waitcnt lgkmcnt(0) +; VARIANT4-NEXT: s_wait_kmcnt 0x0 ; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2 ; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT4-NEXT: s_barrier_signal -1 @@ -105,7 +105,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_2) ; VARIANT4-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo ; VARIANT4-NEXT: global_load_b32 v0, v[1:2], off -; VARIANT4-NEXT: s_waitcnt vmcnt(0) +; VARIANT4-NEXT: s_wait_loadcnt 0x0 ; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT4-NEXT: s_nop 0 ; VARIANT4-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -115,7 +115,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT5: ; %bb.0: ; %entry ; VARIANT5-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; VARIANT5-NEXT: s_waitcnt lgkmcnt(0) +; VARIANT5-NEXT: s_wait_kmcnt 0x0 ; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s2 ; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT5-NEXT: s_barrier_signal -1 @@ -127,7 +127,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_2) ; VARIANT5-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo ; VARIANT5-NEXT: global_load_b32 v0, v[1:2], off -; VARIANT5-NEXT: s_waitcnt vmcnt(0) +; VARIANT5-NEXT: s_wait_loadcnt 0x0 ; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT5-NEXT: s_nop 0 ; VARIANT5-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -137,7 +137,7 
@@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT6: ; %bb.0: ; %entry ; VARIANT6-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; VARIANT6-NEXT: v_lshlrev_b32_e32 v5, 2, v0 -; VARIANT6-NEXT: s_waitcnt lgkmcnt(0) +; VARIANT6-NEXT: s_wait_kmcnt 0x0 ; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 ; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 ; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_2) ; VARIANT6-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo ; VARIANT6-NEXT: global_load_b32 v0, v[1:2], off -; VARIANT6-NEXT: s_waitcnt vmcnt(0) +; VARIANT6-NEXT: s_wait_loadcnt 0x0 ; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] ; VARIANT6-NEXT: s_nop 0 ; VARIANT6-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 220002ce4f6c45..00caa2a36c58da 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_barrier_signal -1 ; GCN-NEXT: s_barrier_wait -1 @@ -26,7 +26,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] 
; GLOBAL-ISEL-NEXT: s_barrier_signal -1 ; GLOBAL-ISEL-NEXT: s_barrier_wait -1 @@ -54,7 +54,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_barrier_signal 1 ; GCN-NEXT: s_barrier_wait 1 @@ -70,7 +70,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_signal 1 ; GLOBAL-ISEL-NEXT: s_barrier_wait 1 @@ -98,7 +98,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_barrier_signal 0 ; GCN-NEXT: s_barrier_wait 0 @@ -114,7 +114,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_signal 0 ; GLOBAL-ISEL-NEXT: s_barrier_wait 0 @@ -144,7 +144,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 ; GCN-NEXT: s_mov_b32 m0, 1 ; GCN-NEXT: 
s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v1, s[0:1] ; GCN-NEXT: s_barrier_signal m0 ; GCN-NEXT: s_barrier_wait 1 @@ -161,7 +161,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_signal m0 ; GLOBAL-ISEL-NEXT: s_barrier_wait 1 @@ -184,7 +184,11 @@ entry: define void @test2_s_barrier_signal_var(i32 %arg) { ; GCN-LABEL: test2_s_barrier_signal_var: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_expcnt 0x0 +; GCN-NEXT: s_wait_samplecnt 0x0 +; GCN-NEXT: s_wait_bvhcnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_mov_b32 m0, s0 @@ -193,7 +197,11 @@ define void @test2_s_barrier_signal_var(i32 %arg) { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_var: ; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 ; GLOBAL-ISEL-NEXT: s_barrier_signal m0 ; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] @@ -206,7 +214,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; 
GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_barrier_signal_isfirst -1 ; GCN-NEXT: s_cselect_b32 s3, s3, s5 @@ -214,7 +222,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: global_load_b32 v2, v1, s[0:1] ; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_nop 0 @@ -225,7 +233,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GLOBAL-ISEL: ; %bb.0: ; %entry ; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst -1 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 @@ -236,7 +244,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GLOBAL-ISEL-NEXT: s_clause 0x1 ; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] ; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -260,7 +268,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_barrier_signal_isfirst 1 ; GCN-NEXT: s_cselect_b32 s3, s3, s5 @@ -268,7 +276,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: global_load_b32 v2, v1, 
s[0:1] ; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_nop 0 @@ -279,7 +287,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GLOBAL-ISEL: ; %bb.0: ; %entry ; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 @@ -290,7 +298,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GLOBAL-ISEL-NEXT: s_clause 0x1 ; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] ; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -314,7 +322,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_barrier_signal_isfirst 1 ; GCN-NEXT: s_cselect_b32 s3, s3, s5 @@ -322,7 +330,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: global_load_b32 v2, v1, s[0:1] ; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_nop 0 @@ -333,7 +341,7 @@ define amdgpu_kernel void 
@test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GLOBAL-ISEL: ; %bb.0: ; %entry ; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 @@ -344,7 +352,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GLOBAL-ISEL-NEXT: s_clause 0x1 ; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] ; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -369,7 +377,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_barrier_signal_isfirst m0 ; GCN-NEXT: s_cselect_b32 s3, s3, s5 @@ -377,7 +385,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: global_load_b32 v2, v1, s[0:1] ; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_nop 0 @@ -389,7 +397,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 @@ -400,7 +408,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; GLOBAL-ISEL-NEXT: s_clause 0x1 ; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] ; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -422,7 +430,11 @@ entry: define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, i32 %arg, ptr addrspace(1) %out) { ; GCN-LABEL: test2_s_barrier_signal_isfirst_var: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_expcnt 0x0 +; GCN-NEXT: s_wait_samplecnt 0x0 +; GCN-NEXT: s_wait_bvhcnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v31 ; GCN-NEXT: v_readfirstlane_b32 s0, v6 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -437,15 +449,19 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa ; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: global_load_b32 v1, v[2:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: global_store_b32 v[7:8], v0, off -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst_var: ; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 
+; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: v_and_b32_e32 v9, 0x3ff, v31 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v6 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -462,10 +478,10 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa ; GLOBAL-ISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 ; GLOBAL-ISEL-NEXT: global_load_b32 v0, v[0:1], off ; GLOBAL-ISEL-NEXT: global_load_b32 v1, v[2:3], off -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v0, off -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -487,7 +503,7 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_mov_b32 m0, s2 @@ -504,7 +520,7 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_init -1 @@ -531,7 +547,7 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 
%mbrC ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_mov_b32 m0, s2 @@ -548,7 +564,7 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_init 1 @@ -575,7 +591,7 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_mov_b32 m0, s2 @@ -592,7 +608,7 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_init 0 @@ -619,7 +635,7 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GCN-NEXT: v_sub_nc_u32_e32 
v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_or_b32 s2, s2, s3 @@ -637,7 +653,7 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_or_b32 m0, s2, s3 @@ -660,7 +676,11 @@ entry: define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { ; GCN-LABEL: test5_s_barrier_init_m0: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_expcnt 0x0 +; GCN-NEXT: s_wait_samplecnt 0x0 +; GCN-NEXT: s_wait_bvhcnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 @@ -672,7 +692,11 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { ; ; GLOBAL-ISEL-LABEL: test5_s_barrier_init_m0: ; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s1, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -694,7 +718,7 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-NEXT: s_barrier_join -1 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -707,7 +731,7 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_join -1 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -734,7 +758,7 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-NEXT: s_barrier_join 1 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -747,7 +771,7 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_join 1 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -774,7 +798,7 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-NEXT: s_barrier_join 0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -787,7 +811,7 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) 
#0 { ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_join 0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -814,7 +838,7 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: global_store_b32 v3, v1, s[0:1] ; GCN-NEXT: s_barrier_join m0 @@ -830,7 +854,7 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_join m0 @@ -852,7 +876,11 @@ entry: define void @test5_s_barrier_join_m0(i32 %arg) { ; GCN-LABEL: test5_s_barrier_join_m0: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_expcnt 0x0 +; GCN-NEXT: s_wait_samplecnt 0x0 +; GCN-NEXT: s_wait_bvhcnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_mov_b32 m0, s0 @@ -861,7 +889,11 @@ define void @test5_s_barrier_join_m0(i32 %arg) { ; ; GLOBAL-ISEL-LABEL: test5_s_barrier_join_m0: ; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 ; GLOBAL-ISEL-NEXT: s_barrier_join m0 ; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] @@ -874,7 +906,7 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_barrier_leave ; GCN-NEXT: s_cselect_b32 s3, s3, s5 @@ -882,7 +914,7 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: global_load_b32 v2, v1, s[0:1] ; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_nop 0 @@ -893,7 +925,7 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp ; GLOBAL-ISEL: ; %bb.0: ; %entry ; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_barrier_leave ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 @@ -904,7 +936,7 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp ; GLOBAL-ISEL-NEXT: s_clause 0x1 ; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] ; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -932,7 +964,7 @@ 
define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-NEXT: s_wakeup_barrier -1 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -945,7 +977,7 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_wakeup_barrier -1 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -972,7 +1004,7 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-NEXT: s_wakeup_barrier 1 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -985,7 +1017,7 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_wakeup_barrier 1 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -1012,7 +1044,7 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-NEXT: s_wakeup_barrier 0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1025,7 +1057,7 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_wakeup_barrier 0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -1052,7 +1084,7 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: global_store_b32 v3, v1, s[0:1] ; GCN-NEXT: s_wakeup_barrier m0 @@ -1068,7 +1100,7 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0 @@ -1090,7 +1122,11 @@ entry: define void @test5_s_wakeup_barrier_m0(i32 %arg) { ; GCN-LABEL: test5_s_wakeup_barrier_m0: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_expcnt 0x0 +; GCN-NEXT: s_wait_samplecnt 0x0 +; GCN-NEXT: s_wait_bvhcnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_mov_b32 m0, s0 @@ 
-1099,7 +1135,11 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) { ; ; GLOBAL-ISEL-LABEL: test5_s_wakeup_barrier_m0: ; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 ; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0 ; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] @@ -1112,7 +1152,7 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_get_barrier_state s2, -1 ; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1124,10 +1164,10 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL: ; %bb.0: ; %entry ; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1148,7 +1188,7 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_get_barrier_state s2, 1 ; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_dual_mov_b32 v1, s2 :: 
v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1160,10 +1200,10 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL: ; %bb.0: ; %entry ; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1184,7 +1224,7 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_get_barrier_state s2, 0 ; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1196,10 +1236,10 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL: ; %bb.0: ; %entry ; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1220,11 +1260,11 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b96 s[0:2], 
s[0:1], 0x24 ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_get_barrier_state s2, m0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1236,11 +1276,11 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i ; GLOBAL-ISEL: ; %bb.0: ; %entry ; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, m0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1259,21 +1299,29 @@ entry: define i32 @test5_s_get_barrier_state_m0(i32 %arg) { ; GCN-LABEL: test5_s_get_barrier_state_m0: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_expcnt 0x0 +; GCN-NEXT: s_wait_samplecnt 0x0 +; GCN-NEXT: s_wait_bvhcnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2) ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_get_barrier_state s0, m0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GLOBAL-ISEL-LABEL: test5_s_get_barrier_state_m0: ; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 ; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v0, s0 ; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] @@ -1289,7 +1337,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_barrier_signal -1 ; GCN-NEXT: s_barrier_wait -1 @@ -1305,7 +1353,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_signal -1 ; GLOBAL-ISEL-NEXT: s_barrier_wait -1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll index 7dffd1a75ed0ec..71eabf2d4473dd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll @@ -35,7 +35,7 @@ define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x4 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: export 
mrt0 v0, v0, v0, v0 done ; GFX12-NEXT: s_endpgm @@ -74,7 +74,7 @@ define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %ind ; GFX12-LABEL: s_buffer_load_index: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done ; GFX12-NEXT: s_endpgm @@ -103,7 +103,7 @@ define amdgpu_ps void @s_buffer_load_index_divergent(<4 x i32> inreg %desc, i32 ; GFX12-LABEL: s_buffer_load_index_divergent: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done ; GFX12-NEXT: s_endpgm main_body: @@ -144,7 +144,7 @@ define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_loadx2_imm: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x40 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done @@ -189,7 +189,7 @@ define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %i ; GFX12-LABEL: s_buffer_loadx2_index: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done @@ -221,7 +221,7 @@ define amdgpu_ps void @s_buffer_loadx2_index_divergent(<4 x i32> inreg %desc, i3 ; GFX12-LABEL: s_buffer_loadx2_index_divergent: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done ; 
GFX12-NEXT: s_endpgm main_body: @@ -267,7 +267,7 @@ define amdgpu_ps void @s_buffer_loadx3_imm(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_loadx3_imm: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_buffer_load_b96 s[0:2], s[0:3], 0x40 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -317,7 +317,7 @@ define amdgpu_ps void @s_buffer_loadx3_index(<4 x i32> inreg %desc, i32 inreg %i ; GFX12-LABEL: s_buffer_loadx3_index: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_buffer_load_b96 s[0:2], s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -358,7 +358,7 @@ define amdgpu_ps void @s_buffer_loadx3_index_divergent(<4 x i32> inreg %desc, i3 ; GFX12-LABEL: s_buffer_loadx3_index_divergent: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b96 v[0:2], v0, s[0:3], null offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v0, v1, v2, v0 done ; GFX12-NEXT: s_endpgm main_body: @@ -408,7 +408,7 @@ define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_loadx4_imm: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_buffer_load_b128 s[0:3], s[0:3], 0xc8 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -463,7 +463,7 @@ define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %i ; GFX12-LABEL: s_buffer_loadx4_index: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_buffer_load_b128 s[0:3], s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ 
-499,7 +499,7 @@ define amdgpu_ps void @s_buffer_loadx4_index_divergent(<4 x i32> inreg %desc, i3 ; GFX12-LABEL: s_buffer_loadx4_index_divergent: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done ; GFX12-NEXT: s_endpgm main_body: @@ -544,7 +544,7 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_mergex2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x4 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done @@ -595,7 +595,7 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_mergex4: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_buffer_load_b128 s[0:3], s[0:3], 0x8 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -728,11 +728,11 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: v_or_b32_e32 v0, 8, v0 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -771,7 +771,7 @@ define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %des ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-NEXT: 
buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:8 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done ; GFX12-NEXT: s_endpgm main_body: @@ -823,7 +823,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg1(<4 x i32> inreg %desc) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1, i32 0) ret i32 %load @@ -869,7 +869,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg4(<4 x i32> inreg %desc) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mov_b32 s4, -4 ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -4, i32 0) ret i32 %load @@ -915,7 +915,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg8(<4 x i32> inreg %desc) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mov_b32 s4, -8 ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -8, i32 0) ret i32 %load @@ -961,7 +961,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit31(<4 x i32> inreg %desc) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_brev_b32 s4, 1 ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -2147483648, i32 0) ret i32 %load @@ -1007,7 +1007,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit30(<4 x i32> inreg %desc) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mov_b32 s4, 2.0 ; GFX12-NEXT: 
s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1073741824, i32 0) ret i32 %load @@ -1053,7 +1053,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit29(<4 x i32> inreg %desc) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_brev_b32 s4, 4 ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 536870912, i32 0) ret i32 %load @@ -1098,7 +1098,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit21(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_bit21: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x200000 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 2097152, i32 0) ret i32 %load @@ -1143,7 +1143,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit20(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_bit20: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x100000 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1048576, i32 0) ret i32 %load @@ -1189,7 +1189,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg_bit20(<4 x i32> inreg %desc) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mov_b32 s4, 0xfff00000 ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1048576, i32 0) ret i32 %load @@ -1225,7 +1225,7 @@ define amdgpu_ps i32 
@s_buffer_load_imm_bit19(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_bit19: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x80000 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 524288, i32 0) ret i32 %load @@ -1271,7 +1271,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg_bit19(<4 x i32> inreg %desc) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mov_b32 s4, 0xfff80000 ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -524288, i32 0) ret i32 %load @@ -1308,7 +1308,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_255(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_255: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0xff -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 255, i32 0) ret i32 %load @@ -1336,7 +1336,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_256(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_256: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x100 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 256, i32 0) ret i32 %load @@ -1364,7 +1364,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1016(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_1016: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3f8 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1016, i32 0) 
ret i32 %load @@ -1392,7 +1392,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1020(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_1020: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fc -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1020, i32 0) ret i32 %load @@ -1429,7 +1429,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1021(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_1021: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fd -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1021, i32 0) ret i32 %load @@ -1465,7 +1465,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1024(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_1024: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1024, i32 0) ret i32 %load @@ -1502,7 +1502,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1025(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_1025: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x401 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1025, i32 0) ret i32 %load @@ -1538,7 +1538,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1028(<4 x i32> inreg %desc) { ; GFX12-LABEL: s_buffer_load_imm_1028: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %load = call i32 
@llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1024, i32 0) ret i32 %load diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll index 5ad7ddfbe5fe9d..4294cea9109f0b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll @@ -7,7 +7,11 @@ declare void @llvm.amdgcn.s.sleep.var(i32) define void @test_s_sleep_var1(i32 %arg) { ; GCN-LABEL: test_s_sleep_var1: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_expcnt 0x0 +; GCN-NEXT: s_wait_samplecnt 0x0 +; GCN-NEXT: s_wait_bvhcnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_sleep_var s0 @@ -19,7 +23,11 @@ define void @test_s_sleep_var1(i32 %arg) { define void @test_s_sleep_var2() { ; GCN-LABEL: test_s_sleep_var2: ; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_expcnt 0x0 +; GCN-NEXT: s_wait_samplecnt 0x0 +; GCN-NEXT: s_wait_bvhcnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_sleep_var 10 ; GCN-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.sleep.var(i32 10) @@ -30,7 +38,7 @@ define amdgpu_kernel void @test_s_sleep_var3(i32 %arg) { ; GCN-LABEL: test_s_sleep_var3: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_sleep_var s0 ; GCN-NEXT: s_endpgm call void @llvm.amdgcn.s.sleep.var(i32 %arg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll index d570dada4f1ab1..9dd11699e5dcf2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll @@ -41,7 +41,7 @@ define amdgpu_ps {<4 x 
float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], null idxen ; GFX12-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], null idxen th:TH_LOAD_NT ; GFX12-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], null idxen th:TH_LOAD_HT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) @@ -79,7 +79,7 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen offset:42 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0) @@ -155,11 +155,11 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], s4 idxen offset:4092 ; GFX12-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], s5 idxen offset:4092 ; GFX12-NEXT: s_mov_b32 s4, 0x8ffc -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v1, v1, v5 ; GFX12-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4 ; GFX12-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3 @@ -200,7 +200,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen offset:4092 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 0, i32 0) @@ -239,7 +239,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen offset:8188 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 8188, i32 0, i32 0) @@ -278,7 +278,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen offset:65532 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 65532, i32 0, i32 0) @@ -317,7 +317,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen offset:8388604 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 8388604, i32 0, i32 0) @@ -356,14 +356,14 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) { ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x800000 :: v_dual_mov_b32 v0, 0 ; 
GFX12-SDAG-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen offset:8388604 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: buffer_load_voffset_large_24bit: ; GFX12-GISEL: ; %bb.0: ; %main_body ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800000 ; GFX12-GISEL-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen offset:8388604 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 16777212, i32 0, i32 0) @@ -392,7 +392,7 @@ define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { ; GFX12-LABEL: buffer_load_idx: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0) @@ -431,7 +431,7 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0) @@ -470,7 +470,7 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen offset:60 -; GFX12-NEXT: 
s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %1, 60 @@ -500,7 +500,7 @@ define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { ; GFX12-LABEL: buffer_load_both: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0) @@ -533,7 +533,7 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i3 ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], null idxen offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0) @@ -566,7 +566,7 @@ define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: buffer_load_format_x v0, v0, s[0:3], null idxen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) @@ -599,7 +599,7 @@ define amdgpu_ps float @buffer_load_x_i32(<4 x i32> inreg %rsrc) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: buffer_load_format_x v0, v0, s[0:3], null idxen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) @@ -633,7 +633,7 @@ 
define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], null idxen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) @@ -678,7 +678,7 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: ; return to shader part epilog @@ -728,7 +728,7 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: ; return to shader part epilog @@ -779,7 +779,7 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: ; return to shader part epilog @@ -830,7 +830,7 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: ; return to shader part epilog @@ -880,7 +880,7 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: ; return to shader part epilog @@ -930,7 +930,7 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: ; return to shader part epilog @@ -980,7 +980,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: ; return to shader part epilog @@ -1030,7 +1030,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll index 
f0a73feaa4a14f..4c1ae4c228adb3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -135,7 +135,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6 ; GFX12-NEXT: ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll index be7c8de4e23e65..9be6b018a06412 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll @@ -39,7 +39,7 @@ define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-PACKED-NEXT: tbuffer_load_d16_format_x v0, v0, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen -; GFX12-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX12-PACKED-NEXT: s_wait_loadcnt 0x0 ; GFX12-PACKED-NEXT: ; return to shader part epilog main_body: %data = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) @@ -83,7 +83,7 @@ define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-PACKED-NEXT: tbuffer_load_d16_format_xy v0, v0, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen -; GFX12-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX12-PACKED-NEXT: s_wait_loadcnt 0x0 ; GFX12-PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-PACKED-NEXT: ; return to shader part epilog main_body: @@ -129,7 +129,7 @@ define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg 
%rsrc) { ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-PACKED-NEXT: tbuffer_load_d16_format_xyz v[0:1], v0, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen -; GFX12-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX12-PACKED-NEXT: s_wait_loadcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-PACKED-NEXT: ; return to shader part epilog main_body: @@ -175,7 +175,7 @@ define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-PACKED-NEXT: tbuffer_load_d16_format_xyzw v[0:1], v0, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen -; GFX12-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX12-PACKED-NEXT: s_wait_loadcnt 0x0 ; GFX12-PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX12-PACKED-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll index 1c66ebf9688ff0..8a1f6911032137 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll @@ -47,7 +47,7 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_l ; GFX12-NEXT: tbuffer_load_format_xyzw v[4:7], v12, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen th:TH_LOAD_NT ; GFX12-NEXT: tbuffer_load_format_xyzw v[8:11], v12, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen th:TH_LOAD_HT ; GFX12-NEXT: tbuffer_load_format_xyzw v[12:15], v12, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 0) @@ -90,7 +90,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 
; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:78 idxen offset:42 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0) @@ -136,7 +136,7 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_l ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v8, s[0:3], s5 format:[BUF_FMT_8_8_8_8_SINT] idxen offset:4095 ; GFX12-NEXT: tbuffer_load_format_xyzw v[4:7], v8, s[0:3], s4 format:[BUF_FMT_32_32_32_32_SINT] idxen offset:73 ; GFX12-NEXT: tbuffer_load_format_xyzw v[8:11], v8, s[0:3], s4 format:77 idxen offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4095, i32 61, i32 47, i32 0) %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 73, i32 %soffs, i32 62, i32 0) @@ -172,7 +172,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { ; GFX12-LABEL: tbuffer_load_idx: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:78 idxen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 78, i32 0) @@ -212,7 +212,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null format:78 idxen offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %vdata = call <4 x 
i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 78, i32 0) @@ -252,7 +252,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null format:78 idxen offen offset:52 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %voffs, 52 @@ -283,7 +283,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i3 ; GFX12-LABEL: tbuffer_load_both: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null format:78 idxen offen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 78, i32 0) @@ -317,7 +317,7 @@ define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: tbuffer_load_format_xy v[0:1], v0, s[0:3], null format:77 idxen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %vdata = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast <2 x i32> %vdata to <2 x float> @@ -350,7 +350,7 @@ define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: tbuffer_load_format_x v0, v0, s[0:3], null format:77 idxen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %vdata = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast i32 
%vdata to float @@ -383,7 +383,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:4092 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 0, i32 63, i32 0) @@ -422,7 +422,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_13bit(<4 x i32> inreg) ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:8188 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 8188, i32 0, i32 63, i32 0) @@ -461,7 +461,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_16bit(<4 x i32> inreg) ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:65532 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 65532, i32 0, i32 63, i32 0) @@ -500,7 +500,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_23bit(<4 x i32> inreg) ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:8388604 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: 
%data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 8388604, i32 0, i32 63, i32 0) @@ -539,14 +539,14 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_24bit(<4 x i32> inreg) ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x800000 :: v_dual_mov_b32 v0, 0 ; GFX12-SDAG-NEXT: tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:8388604 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: tbuffer_load_voffset_large_24bit: ; GFX12-GISEL: ; %bb.0: ; %main_body ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800000 ; GFX12-GISEL-NEXT: tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:8388604 -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 16777212, i32 0, i32 63, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index 22da2d0b023269..22ec22dc2db024 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -58,7 +58,7 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; GFX12-PACKED-NEXT: s_clause 0x1 ; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 ; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] 
idxen @@ -123,7 +123,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX12-PACKED-NEXT: s_clause 0x1 ; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 ; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen @@ -200,7 +200,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-SDAG-NEXT: s_clause 0x1 ; GFX12-PACKED-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 ; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s5 @@ -215,7 +215,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 ; GFX12-PACKED-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 ; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4 ; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -294,7 +294,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX12-PACKED-NEXT: s_clause 0x1 ; GFX12-PACKED-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 ; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v2, s6 diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll index 9bbabce5b22076..85fc76eb182a3b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll @@ -305,7 +305,7 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i3 ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], null idxen -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], null format:[BUF_FMT_8_8_8_8_UINT] idxen ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll index d299e760b87740..d5e62896232e25 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll @@ -58,10 +58,10 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur ; GFX12-LABEL: constant_load_v8f32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[8:9], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_f32 s0, s0, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) ; GFX12-NEXT: s_add_f32 s0, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index d00044c6ac1ab8..cfaefca3a516d7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -48,10 +48,10 @@ define 
amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; GFX12-LABEL: constant_load_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -138,10 +138,10 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[20:21], s[18:19], 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[16:17], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[0:1], s[0:1], s[20:21] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], s[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 4ed4034a0348f4..cef0c8733d9912 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -67,9 +67,9 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -141,9 +141,9 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; 
GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -213,9 +213,9 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -286,9 +286,9 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -359,9 +359,9 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -432,9 +432,9 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; GFX12: 
; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -488,9 +488,9 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v32i1: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -547,10 +547,10 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v64i1: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -612,9 +612,9 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -679,9 +679,9 @@ define amdgpu_kernel void 
@constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -744,9 +744,9 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -811,9 +811,9 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -883,9 +883,9 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -960,9 +960,9 @@ define 
amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1045,9 +1045,9 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 @@ -1136,9 +1136,9 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0 ; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 @@ -1225,9 +1225,9 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0 ; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v3, 3, v0 @@ -1322,9 +1322,9 @@ define 
amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0 ; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 ; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0 @@ -1442,9 +1442,9 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v2, 5, v0 ; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v6, 3, v0 @@ -1577,9 +1577,9 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0 ; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0 ; GFX12-NEXT: v_lshrrev_b16 v5, 6, v0 @@ -1766,9 +1766,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v2, 13, v0 ; GFX12-NEXT: v_lshrrev_b16 v13, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v15, 3, v0 @@ -1989,9 +1989,9 @@ define 
amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0 ; GFX12-NEXT: v_lshrrev_b16 v4, 13, v0 ; GFX12-NEXT: v_lshrrev_b16 v8, 14, v0 @@ -2347,9 +2347,9 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s3, s2, 24 ; GFX12-NEXT: v_lshrrev_b16 v1, 13, s2 ; GFX12-NEXT: v_lshrrev_b16 v2, 9, s2 @@ -2768,9 +2768,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 ; GFX12-NEXT: v_lshrrev_b16 v12, 13, s2 ; GFX12-NEXT: v_lshrrev_b16 v13, 14, s2 @@ -3442,9 +3442,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v2, 13, s2 ; GFX12-NEXT: s_lshr_b32 s4, s3, 24 ; GFX12-NEXT: v_mov_b32_e32 
v0, 0 @@ -4242,9 +4242,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s5, s2, 24 ; GFX12-NEXT: v_lshrrev_b16 v28, 12, s3 ; GFX12-NEXT: v_lshrrev_b16 v29, 13, s3 @@ -4454,9 +4454,9 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -4525,9 +4525,9 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -4597,9 +4597,9 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; 
GFX12-NEXT: s_nop 0 @@ -4668,9 +4668,9 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -4748,9 +4748,9 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4832,9 +4832,9 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4934,9 +4934,9 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v5, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; 
GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v3, 2, v0 @@ -5044,9 +5044,9 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v6, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v6, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0 ; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 @@ -5161,9 +5161,9 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v6, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 ; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0 @@ -5285,9 +5285,9 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0 ; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 ; GFX12-NEXT: v_lshrrev_b16 v3, 1, v0 @@ -5456,9 +5456,9 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v12, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0 ; GFX12-NEXT: v_lshrrev_b16 v8, 3, v0 @@ -5645,9 +5645,9 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v16, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v3, 6, v1 ; GFX12-NEXT: v_lshrrev_b16 v5, 7, v1 ; GFX12-NEXT: v_lshrrev_b16 v7, 4, v1 @@ -5933,9 +5933,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v28, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v4, 11, v0 ; GFX12-NEXT: v_lshrrev_b16 v8, 9, v0 @@ -6266,9 +6266,9 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v32, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v32, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v3, 14, v1 ; GFX12-NEXT: v_lshrrev_b16 v5, 15, v1 ; GFX12-NEXT: v_lshrrev_b16 v7, 12, v1 @@ -6781,9 +6781,9 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v0, 13, s2 ; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2 ; GFX12-NEXT: s_lshr_b32 s3, s2, 24 @@ -7446,9 +7446,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v26, 6, s2 ; GFX12-NEXT: v_lshrrev_b16 v28, 7, s2 ; GFX12-NEXT: v_lshrrev_b16 v4, 2, s2 @@ -8432,9 +8432,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2 ; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2 ; GFX12-NEXT: s_lshr_b32 s4, s3, 24 @@ -9725,9 +9725,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s19, s5 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s26, s3, 22 ; GFX12-NEXT: s_lshr_b32 s28, s3, 23 ; GFX12-NEXT: s_lshr_b32 s30, s3, 20 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 585f96b9ffb2e6..f48a693315671c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -79,9 +79,9 @@ define amdgpu_kernel void @constant_load_i16(ptr 
addrspace(1) %out, ptr addrspac ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -148,9 +148,9 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v2i16: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -253,9 +253,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v3i16: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_clause 0x1 @@ -329,10 +329,10 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v4i16: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, 
v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -409,10 +409,10 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v8i16: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] @@ -523,9 +523,9 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; GFX12-LABEL: constant_load_v16i16: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1 @@ -744,7 +744,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0xf ; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28 ; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24 @@ -762,9 +762,9 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10 ; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6 ; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2 -; GFX12-NEXT: s_waitcnt vmcnt(4) +; GFX12-NEXT: s_wait_loadcnt 0x4 ; GFX12-NEXT: 
global_store_b128 v[0:1], v[0:3], off -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -839,9 +839,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -917,9 +917,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -994,9 +994,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1072,9 +1072,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: global_load_i16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1153,9 +1153,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s3, s2, 0xffff ; GFX12-NEXT: s_lshr_b32 s2, s2, 16 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1241,9 +1241,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sext_i32_i16 s3, s2 ; GFX12-NEXT: s_ashr_i32 s2, s2, 16 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1339,9 +1339,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v3i16_to_v3i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s3, s3, 0xffff ; GFX12-NEXT: s_and_b32 s4, s2, 0xffff ; GFX12-NEXT: s_lshr_b32 s2, s2, 16 @@ -1441,9 +1441,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v3i16_to_v3i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s4, s2, 16 ; GFX12-NEXT: s_sext_i32_i16 s2, s2 ; GFX12-NEXT: s_sext_i32_i16 s3, s3 @@ -1546,9 +1546,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v4i16_to_v4i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s4, s3, 16 ; GFX12-NEXT: s_and_b32 s3, s3, 0xffff ; GFX12-NEXT: s_and_b32 s5, s2, 0xffff @@ -1656,9 +1656,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v4i16_to_v4i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s4, s3, 16 ; GFX12-NEXT: s_ashr_i32 s5, s2, 16 ; GFX12-NEXT: s_sext_i32_i16 s2, s2 @@ -1808,9 +1808,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v8i16_to_v8i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s8, s7, 16 ; GFX12-NEXT: s_and_b32 s7, s7, 0xffff ; GFX12-NEXT: s_and_b32 s9, s6, 0xffff @@ -1970,9 +1970,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v8i16_to_v8i32: ; 
GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s8, s7, 16 ; GFX12-NEXT: s_ashr_i32 s9, s6, 16 ; GFX12-NEXT: s_sext_i32_i16 s6, s6 @@ -2220,9 +2220,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; GFX12-LABEL: constant_zextload_v16i16_to_v16i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s16, s11, 16 ; GFX12-NEXT: s_and_b32 s11, s11, 0xffff ; GFX12-NEXT: s_and_b32 s17, s10, 0xffff @@ -2488,9 +2488,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GFX12-LABEL: constant_sextload_v16i16_to_v16i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s16, s11, 16 ; GFX12-NEXT: s_ashr_i32 s17, s10, 16 ; GFX12-NEXT: s_sext_i32_i16 s10, s10 @@ -2938,9 +2938,9 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GFX12-LABEL: constant_zextload_v32i16_to_v32i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s33, s15, 16 ; GFX12-NEXT: s_and_b32 s15, s15, 0xffff ; GFX12-NEXT: s_and_b32 s34, s14, 0xffff @@ -3428,9 +3428,9 @@ define amdgpu_kernel void 
@constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GFX12-LABEL: constant_sextload_v32i16_to_v32i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s33, s15, 16 ; GFX12-NEXT: s_ashr_i32 s34, s14, 16 ; GFX12-NEXT: s_sext_i32_i16 s14, s14 @@ -4291,11 +4291,11 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GFX12-LABEL: constant_zextload_v64i16_to_v64i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x40 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s49, s31, 16 ; GFX12-NEXT: s_lshr_b32 s65, s15, 16 ; GFX12-NEXT: s_lshr_b32 s66, s14, 16 @@ -5230,11 +5230,11 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GFX12-LABEL: constant_sextload_v64i16_to_v64i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x40 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s65, s15, 16 ; GFX12-NEXT: s_ashr_i32 s66, s14, 16 ; GFX12-NEXT: s_sext_i32_i16 s14, s14 @@ -5431,9 +5431,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] -; 
GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -5521,9 +5521,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -5606,9 +5606,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -5691,9 +5691,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -5783,9 +5783,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v2i16_to_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 @@ -5883,9 +5883,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v2i16_to_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s4, s2, 16 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 @@ -6010,9 +6010,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s4, 0xffff, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 @@ -6157,9 +6157,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 ; GFX12-NEXT: s_lshr_b32 s8, s3, 16 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000 @@ -6358,9 +6358,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: 
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s2, 0xffff, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -6599,9 +6599,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s14, s7 ; GFX12-NEXT: s_lshr_b32 s16, s7, 16 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x100000 @@ -6946,9 +6946,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-LABEL: constant_zextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s10, s5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10 @@ -7382,9 +7382,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-LABEL: constant_sextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s30, s5 ; GFX12-NEXT: s_lshr_b32 s34, s5, 16 ; GFX12-NEXT: s_bfe_i64 s[28:29], 
s[4:5], 0x100000 @@ -8027,9 +8027,9 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-LABEL: constant_zextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s18, s15, 0xffff ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s18 @@ -8855,9 +8855,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-LABEL: constant_sextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s28, s2, 16 ; GFX12-NEXT: s_lshr_b32 s42, s5, 16 ; GFX12-NEXT: s_lshr_b32 s52, s8, 16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 16f95409055b19..96ceba8ae3b66e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -74,9 +74,9 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; GFX12-LABEL: constant_load_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -159,10 +159,10 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v2i32: ; 
GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -256,9 +256,9 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v3i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] @@ -350,10 +350,10 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v4i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] @@ -478,9 +478,9 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v8i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1 @@ -637,11 +637,11 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v9i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x20 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, s12 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -804,12 +804,12 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX12-LABEL: constant_load_v10i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[12:13], s[10:11], 0x20 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v10, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v2, s6 @@ -983,11 +983,11 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; GFX12-LABEL: constant_load_v11i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v8, s12 ; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1159,12 +1159,12 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX12-LABEL: constant_load_v12i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[12:15], s[10:11], 0x20 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v12, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15 ; GFX12-NEXT: v_dual_mov_b32 v1, s13 :: v_dual_mov_b32 v2, s14 ; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 @@ -1366,9 +1366,9 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; GFX12-LABEL: constant_load_v16i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13 ; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15 ; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9 @@ -1463,9 +1463,9 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX12-LABEL: constant_zextload_i32_to_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ 
-1553,9 +1553,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX12-LABEL: constant_sextload_i32_to_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 @@ -1641,9 +1641,9 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1731,9 +1731,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 @@ -1830,9 +1830,9 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 @@ -1941,9 +1941,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s4, s3, 31 ; GFX12-NEXT: s_ashr_i32 s5, s2, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -2071,9 +2071,9 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7 @@ -2227,9 +2227,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s8, s7, 31 ; GFX12-NEXT: s_ashr_i32 s9, s6, 31 ; GFX12-NEXT: s_ashr_i32 s2, s5, 31 @@ -2425,9 +2425,9 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7 @@ -2684,9 +2684,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s16, s11, 31 ; GFX12-NEXT: s_ashr_i32 s17, s10, 31 ; GFX12-NEXT: s_ashr_i32 s14, s9, 31 @@ -3143,9 +3143,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX12-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s28, s11, 31 ; GFX12-NEXT: s_ashr_i32 s29, s10, 31 ; GFX12-NEXT: s_ashr_i32 s33, s15, 31 @@ -3504,9 +3504,9 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX12-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s14 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s15 @@ -4356,11 +4356,11 @@ define amdgpu_kernel void 
@constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX12-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s49, s15, 31 ; GFX12-NEXT: s_ashr_i32 s64, s31, 31 ; GFX12-NEXT: s_ashr_i32 s65, s30, 31 @@ -5032,11 +5032,11 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX12-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: v_mov_b32_e32 v2, s31 @@ -5427,11 +5427,11 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX12-LABEL: constant_load_v32i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29 ; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31 ; GFX12-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 9b3830671acbd6..e1b7d67b208df6 100644 
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -64,10 +64,10 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; GFX12-LABEL: constant_load_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -143,10 +143,10 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v2i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] @@ -254,11 +254,11 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v3i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s9 ; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -373,9 +373,9 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr 
addrsp ; GFX12-LABEL: constant_load_v4i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1 @@ -559,9 +559,9 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v8i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13 ; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15 ; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9 @@ -900,11 +900,11 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX12-LABEL: constant_load_v16i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29 ; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31 ; GFX12-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index f18a34515a8265..3c819c12434707 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -80,9 
+80,9 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -167,9 +167,9 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -279,9 +279,9 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: constant_load_v3i8: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[0:1] offset:2 @@ -351,9 +351,9 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: constant_load_v4i8: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; 
GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -424,10 +424,10 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: constant_load_v8i8: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -504,10 +504,10 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: constant_load_v16i8: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] @@ -584,9 +584,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -662,9 +662,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: global_load_i8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -739,9 +739,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -817,9 +817,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -913,9 +913,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1013,9 +1013,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1113,9 +1113,9 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX12-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_and_b32 s3, s2, 0xff ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 @@ -1217,9 +1217,9 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_sext_i32_i8 s3, s2 ; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 @@ -1323,9 +1323,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX12-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_lshr_b32 s3, s2, 24 ; GFX12-NEXT: s_and_b32 s4, s2, 0xff @@ -1432,9 +1432,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; 
GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_ashr_i32 s3, s2, 24 ; GFX12-NEXT: s_sext_i32_i8 s4, s2 @@ -1584,9 +1584,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX12-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 ; GFX12-NEXT: s_lshr_b32 s5, s2, 24 @@ -1748,9 +1748,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 ; GFX12-NEXT: s_ashr_i32 s6, s2, 24 @@ -2000,9 +2000,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5 @@ -2276,9 +2276,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5 @@ -2725,9 +2725,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10 ; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9 @@ -3232,9 +3232,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11 ; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9 ; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 @@ -4087,9 +4087,9 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v0, 8, s15 ; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1 @@ -5052,9 +5052,9 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; 
GFX12-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v0, 8, s15 ; GFX12-NEXT: s_ashr_i32 s49, s15, 24 ; GFX12-NEXT: s_bfe_i32 s50, s15, 0x80010 @@ -5268,9 +5268,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -5354,9 +5354,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -5438,9 +5438,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5523,9 +5523,9 @@ define amdgpu_kernel void 
@constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -5629,9 +5629,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v2, 8, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5740,9 +5740,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v4, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5871,9 +5871,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX12-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2 ; GFX12-NEXT: s_bfe_u32 s3, 
s2, 0x80010 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -6022,9 +6022,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_lshr_b32 s4, s2, 16 ; GFX12-NEXT: s_lshr_b32 s6, s2, 24 @@ -6228,9 +6228,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 @@ -6481,9 +6481,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2 ; GFX12-NEXT: v_lshrrev_b16 v7, 8, s3 ; GFX12-NEXT: s_lshr_b32 s6, s3, 16 @@ -6834,9 +6834,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 
s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -7283,9 +7283,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v10, 8, s7 ; GFX12-NEXT: v_lshrrev_b16 v11, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v21, 8, s5 @@ -7942,9 +7942,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s10, s7, 0x80010 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10 @@ -8801,9 +8801,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7 ; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5 @@ -9010,9 +9010,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) 
%out, pt ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -9098,9 +9098,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -9184,9 +9184,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -9272,9 +9272,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -9366,9 +9366,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr 
addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9479,9 +9479,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8 ; GFX12-NEXT: v_ashrrev_i16 v1, 8, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9608,9 +9608,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 ; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2 ; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3 @@ -9758,9 +9758,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_i32 s4, s2, 0x80000 ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 ; GFX12-NEXT: v_ashrrev_i16 
v0, 8, s2 @@ -9949,9 +9949,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX12-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s5, s2, 16 ; GFX12-NEXT: s_lshr_b32 s6, s3, 16 ; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2 @@ -10180,9 +10180,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_i32 s8, s2, 0x80000 ; GFX12-NEXT: s_bfe_i32 s9, s3, 0x80000 ; GFX12-NEXT: s_lshr_b32 s6, s2, 16 @@ -10510,9 +10510,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s3, s6, 16 ; GFX12-NEXT: s_lshr_b32 s9, s7, 16 ; GFX12-NEXT: s_lshr_b32 s11, s4, 16 @@ -10921,9 +10921,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s8, s6, 16 ; GFX12-NEXT: 
v_ashrrev_i16 v5, 8, s6 ; GFX12-NEXT: s_bfe_i32 s6, s6, 0x80000 @@ -11532,9 +11532,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s25, s1, 16 ; GFX12-NEXT: s_lshr_b32 s21, s3, 16 ; GFX12-NEXT: s_lshr_b32 s23, s0, 16 @@ -12306,9 +12306,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s12, s4, 16 ; GFX12-NEXT: s_lshr_b32 s14, s2, 16 ; GFX12-NEXT: v_ashrrev_i16 v4, 8, s2 diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index fb3c04235b8e4d..0dbce3b6b5fe6f 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -5,12 +5,12 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GCN-LABEL: copy_flat: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader ; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GCN-NEXT: .p2align 6 ; GCN-NEXT: .LBB0_2: ; %for.body @@ -24,7 +24,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture 
%d, ptr nocapture readonly %s ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_store_b128 v[4:5], v[0:3] ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: .LBB0_3: ; %for.end @@ -52,13 +52,13 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; GCN-LABEL: copy_global: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader ; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GCN-NEXT: .LBB1_2: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -67,7 +67,7 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; GCN-NEXT: s_add_co_i32 s4, s4, -1 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1] ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 @@ -98,7 +98,7 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr ; GCN-LABEL: copy_constant: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB2_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader @@ -106,13 +106,13 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: .LBB2_2: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 ; GCN-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0 ; GCN-NEXT: s_add_co_i32 s4, s4, -1 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9 ; GCN-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 ; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1] @@ -145,7 +145,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa ; GCN-LABEL: copy_local: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: .LBB3_1: ; %for.body @@ -158,9 +158,9 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa ; GCN-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3 ; GCN-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1 ; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: s_wait_dscnt 0x1 ; GCN-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3 -; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: s_wait_dscnt 0x1 ; GCN-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1 ; GCN-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN-NEXT: .LBB3_2: ; %for.end diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 047cb3ab400084..b6263fc314f92b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -188,9 +188,9 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-WGP-NEXT: scratch_load_b32 v0, off, 
s2 th:TH_LOAD_NT_HT -; GFX12-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-WGP-NEXT: s_nop 0 ; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -202,9 +202,9 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT_HT -; GFX12-CU-NEXT: s_waitcnt vmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-CU-NEXT: s_nop 0 ; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -398,10 +398,10 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT -; GFX12-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_NT +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-WGP-NEXT: s_nop 0 ; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -413,10 +413,10 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-CU-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT -; 
GFX12-CU-NEXT: s_waitcnt vmcnt(0) +; GFX12-CU-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_NT +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-CU-NEXT: s_nop 0 ; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -596,21 +596,21 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX12-WGP-LABEL: private_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT_WB +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-CU-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT_WB +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: @@ -792,23 +792,23 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-WGP-LABEL: private_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB +; 
GFX12-WGP-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB +; GFX12-CU-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 4b1fb295adec2a..0fa4ea8acfbb3e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -132,9 +132,9 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_RT_NT -; GFX12-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s2 scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-WGP-NEXT: s_nop 0 ; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -146,9 +146,9 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_RT_NT -; GFX12-CU-NEXT: s_waitcnt vmcnt(0) +; GFX12-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-CU-NEXT: s_nop 0 ; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -288,10 +288,10 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT -; GFX12-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, off scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-WGP-NEXT: s_nop 0 ; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -303,10 +303,10 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-CU-NEXT: scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT -; GFX12-CU-NEXT: s_waitcnt vmcnt(0) +; GFX12-CU-NEXT: scratch_load_b32 v0, v0, off scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-CU-NEXT: s_nop 0 ; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -441,23 +441,23 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX12-WGP-LABEL: private_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 
-; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT_RT -; GFX12-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s2 scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-CU-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT_RT -; GFX12-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s2 scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: @@ -592,25 +592,25 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-WGP-LABEL: private_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT -; GFX12-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: private_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 -; GFX12-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: 
v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT -; GFX12-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-CU-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index e2617fc453b58f..8cc85dbd6cf11c 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -114,13 +114,13 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 ; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s8, s2 ; GFX12-NEXT: s_mov_b32 s9, s3 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null ; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 ; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s8, s2 ; GFX12-NEXT: s_mov_b32 s9, s3 ; GFX12-NEXT: s_clause 0x1 @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_lo_u32 v3, v3, v7 ; GFX12-NEXT: v_mul_lo_u32 v2, v2, v6 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5 @@ -403,7 +403,7 @@ define amdgpu_kernel void 
@s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mul_i32 s0, s0, s6 ; GFX12-NEXT: s_mov_b32 s6, -1 @@ -556,14 +556,14 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_mov_b32 s15, s11 ; GFX12-NEXT: s_mov_b32 s2, s10 ; GFX12-NEXT: s_mov_b32 s3, s11 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s12, s6 ; GFX12-NEXT: s_mov_b32 s13, s7 ; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null ; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null ; GFX12-NEXT: s_mov_b32 s8, s4 ; GFX12-NEXT: s_mov_b32 s9, s5 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null ; GFX12-NEXT: s_nop 0 @@ -677,7 +677,7 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; GFX12-LABEL: mul64_sext_c: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 @@ -786,7 +786,7 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -922,13 +922,13 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, 
s6 ; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s8, s2 ; GFX12-NEXT: s_mov_b32 s9, s3 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0 ; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null @@ -1069,13 +1069,13 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 ; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s8, s2 ; GFX12-NEXT: s_mov_b32 s9, s3 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0 ; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null @@ -1213,13 +1213,13 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 ; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s8, s2 ; GFX12-NEXT: s_mov_b32 s9, s3 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0 ; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null @@ -1329,7 +1329,7 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c ; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 ; GFX12-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_i32 s2, s2, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1455,13 +1455,13 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 ; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s8, s2 ; GFX12-NEXT: s_mov_b32 s9, s3 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX12-NEXT: s_nop 0 @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c ; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 @@ -1730,7 +1730,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 ; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s8, s2 ; GFX12-NEXT: s_mov_b32 s9, s3 ; GFX12-NEXT: s_clause 0x1 @@ -1738,7 +1738,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX12-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX12-NEXT: v_and_b32_e32 v0, 1, v0 @@ -1898,7 +1898,7 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2080,14 +2080,14 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX12-NEXT: s_mov_b32 s3, s11 ; GFX12-NEXT: s_mov_b32 s14, s10 ; GFX12-NEXT: s_mov_b32 s15, s11 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s12, s6 ; GFX12-NEXT: s_mov_b32 s13, s7 ; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null ; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null ; GFX12-NEXT: s_mov_b32 s8, s4 ; GFX12-NEXT: s_mov_b32 s9, s5 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2 @@ -2313,7 +2313,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u32 s2, 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX12-NEXT: ; %bb.1: ; %else @@ -2329,7 +2329,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: ; %bb.4: ; %if ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, s2 ; GFX12-NEXT: s_mov_b32 s5, s3 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null @@ -2337,10 +2337,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr 
addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: .LBB15_5: ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: .LBB15_6: ; %endif -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2580,7 +2580,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-LABEL: mul64_in_branch: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX12-NEXT: ; %bb.1: ; %else @@ -2603,7 +2603,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: .LBB16_5: ; %endif ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2926,7 +2926,7 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX12-NEXT: s_mov_b32 s17, s3 ; GFX12-NEXT: s_mov_b32 s19, s3 ; GFX12-NEXT: s_mov_b32 s24, s3 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s14, s8 ; GFX12-NEXT: s_mov_b32 s12, s9 @@ -3209,11 +3209,11 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c ; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b128 v[0:3], v15, 
s[0:1] ; GFX12-NEXT: global_load_b128 v[4:7], v15, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0 ; GFX12-NEXT: v_mul_lo_u32 v14, v5, v2 ; GFX12-NEXT: v_mul_lo_u32 v3, v4, v3 @@ -3347,7 +3347,11 @@ define i32 @mul_pow2_plus_1(i32 %val) { ; ; GFX12-LABEL: mul_pow2_plus_1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index 5a8814e7b20fa3..6e27d3f51d6980 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -37,9 +37,13 @@ define i8 @flat_inst_valu_offset_1(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 1 %load = load i8, ptr %gep, align 4 @@ -72,9 +76,13 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 2047 %load = load i8, ptr %gep, align 4 @@ -107,9 +115,13 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 4095 %load = load i8, ptr %gep, align 4 @@ -146,9 +158,13 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max: @@ -203,9 +219,13 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_24bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8388607 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_24bit_max: @@ -260,9 +280,13 @@ define i8 
@flat_inst_valu_offset_neg_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -2048 %load = load i8, ptr %gep, align 4 @@ -299,9 +323,13 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -4096 %load = load i8, ptr %gep, align 4 @@ -338,9 +366,13 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -8192 %load = load i8, ptr %gep, align 4 @@ -377,9 +409,13 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_neg_24bit_max: ; GFX12: ; 
%bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8388608 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -8388608 %load = load i8, ptr %gep, align 4 @@ -413,9 +449,13 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_2x_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 4095 %load = load i8, ptr %gep, align 4 @@ -452,9 +492,13 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_2x_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max: @@ -509,9 +553,13 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_2x_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; 
GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max: @@ -566,11 +614,15 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388606 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max: @@ -593,11 +645,15 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 16777214 %load = load i8, ptr %gep, align 4 @@ -634,9 +690,13 @@ define i8 
@flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -4096 %load = load i8, ptr %gep, align 4 @@ -673,9 +733,13 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -8192 %load = load i8, ptr %gep, align 4 @@ -712,9 +776,13 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -16384 %load = load i8, ptr %gep, align 4 @@ -751,20 +819,28 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) { ; ; GFX12-SDAG-LABEL: 
flat_inst_valu_offset_2x_neg_24bit_max: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -16777215 %load = load i8, ptr %gep, align 4 @@ -802,11 +878,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 -; 
GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0: @@ -851,7 +931,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -860,7 +944,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589936639 %load = load i8, ptr %gep, align 4 @@ -898,11 +982,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 
s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1: @@ -947,7 +1035,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800 ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -956,7 +1048,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589936640 %load = load i8, ptr %gep, align 4 @@ -994,11 +1086,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0: @@ -1043,7 +1139,11 @@ define i8 
@flat_inst_valu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1052,7 +1152,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589938687 %load = load i8, ptr %gep, align 4 @@ -1090,11 +1190,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1: @@ -1139,7 +1243,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1: ; 
GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000 ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1148,7 +1256,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589938688 %load = load i8, ptr %gep, align 4 @@ -1186,11 +1294,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0: @@ -1235,7 +1347,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1244,7 +1360,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589942783 %load = load i8, ptr %gep, align 4 @@ -1282,11 +1398,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1: @@ -1331,7 +1451,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000 ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1340,7 +1464,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589942784 %load = load i8, ptr %gep, align 4 @@ -1379,11 +1503,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0: @@ -1428,7 +1556,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1437,7 +1569,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854773761 %load = load i8, ptr %gep, align 4 @@ -1476,11 +1608,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1: @@ -1525,7 +1661,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800 ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1534,7 +1674,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854773760 %load = load i8, ptr %gep, align 4 @@ -1573,11 +1713,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0: @@ -1622,7 +1766,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 
0xfff ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1631,7 +1779,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854771713 %load = load i8, ptr %gep, align 4 @@ -1670,11 +1818,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1: @@ -1719,7 +1871,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000 ; GFX12-GISEL-NEXT: 
s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1728,7 +1884,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854771712 %load = load i8, ptr %gep, align 4 @@ -1767,11 +1923,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0: @@ -1816,7 +1976,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; 
GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1825,7 +1989,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854767617 %load = load i8, ptr %gep, align 4 @@ -1864,11 +2028,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1: @@ -1913,7 +2081,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000 ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1922,7 +2094,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854767616 %load = load i8, ptr %gep, align 4 @@ -1967,10 +2139,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_1: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 1 @@ -2017,10 +2189,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_11bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 2047 @@ -2067,10 +2239,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_12bit_max: ; GFX12: ; %bb.0: ; 
GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 4095 @@ -2121,10 +2293,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_13bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm ; @@ -2201,10 +2373,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm ; @@ -2281,10 +2453,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm ; @@ -2361,10 +2533,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm ; @@ -2437,10 +2609,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 4095 @@ -2491,10 +2663,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm ; @@ -2571,10 +2743,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm ; @@ -2651,10 +2823,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm ; @@ -2731,10 +2903,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 th:TH_LOAD_RT_NT -; GFX12-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm ; @@ -2811,10 +2983,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm ; @@ -2891,12 +3063,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -2929,13 +3101,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; 
GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 8589936639 @@ -2986,12 +3158,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -3024,13 +3196,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] 
scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 8589936640 @@ -3081,12 +3253,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -3119,13 +3291,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 8589938687 @@ -3177,12 +3349,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-SDAG-LABEL: 
flat_inst_salu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -3215,13 +3387,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 8589938688 @@ -3273,12 +3445,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, 
null, 2, s1, s0 -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -3311,13 +3483,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 8589942783 @@ -3369,12 +3541,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ 
-3407,13 +3579,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 8589942784 @@ -3467,13 +3639,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -3506,13 +3678,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 
-; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 -9223372036854773761 @@ -3566,13 +3738,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -3605,13 +3777,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 ; GFX12-GISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 -9223372036854773760 @@ -3665,13 +3837,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -3704,13 +3876,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 -9223372036854771713 @@ -3764,13 +3936,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -3803,13 +3975,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 
-9223372036854771712 @@ -3863,13 +4035,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -3902,13 +4074,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 -9223372036854767617 @@ -3962,13 +4134,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; 
%bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-NEXT: s_endpgm ; @@ -4001,13 +4173,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm %gep = getelementptr i8, ptr %p, i64 -9223372036854767616 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index 88be7e54ced107..97dd288004edf6 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -35,9 +35,13 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_1: ; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:1 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 1 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -68,9 +72,13 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:2047 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -103,9 +111,13 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:4095 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: global_inst_valu_offset_12bit_max: @@ -151,9 +163,13 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8191 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_13bit_max: @@ -217,9 +233,13 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_24bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8388607 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_24bit_max: @@ -277,9 +297,13 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-2048 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -312,9 +336,13 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-4096 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -351,9 +379,13 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8192 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -390,9 +422,13 @@ define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_neg_24bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8388608 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8388608 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -425,9 +461,13 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_2x_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:4095 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_11bit_max: @@ -473,9 +513,13 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_2x_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8191 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max: @@ -539,9 +583,13 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_2x_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:16383 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max: @@ -605,11 +653,15 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) { ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_24bit_max: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, 
vcc_lo, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max: @@ -641,11 +693,15 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) { ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8388606 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -678,9 +734,13 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-4096 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -717,9 +777,13 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_2x_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8192 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -756,9 +820,13 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_valu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-16384 -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -795,11 +863,15 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) { ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_24bit_max: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max: @@ -831,11 +903,15 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr 
addrspace(1) %p) { ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -887,7 +963,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -896,7 +976,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0: @@ -928,11 +1008,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; ; GFX12-SDAG-LABEL: 
global_inst_valu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -983,7 +1067,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800 ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -992,7 +1080,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1: @@ -1024,11 +1112,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: 
-; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1079,7 +1171,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1088,7 +1184,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0: @@ -1120,11 +1216,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1175,7 +1275,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000 ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1184,7 +1288,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1: @@ -1216,11 +1320,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: 
s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1271,7 +1379,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1280,7 +1392,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0: @@ -1312,11 +1424,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1367,7 +1483,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000 ; GFX12-GISEL-NEXT: s_mov_b32 s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1376,7 +1496,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1: @@ -1408,11 +1528,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1463,7 +1587,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1472,7 +1600,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: @@ -1505,11 +1633,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386561 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1560,7 +1692,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800 ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1569,7 +1705,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: @@ -1602,11 +1738,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386560 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1657,7 +1797,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1666,7 +1810,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0: @@ -1699,11 +1843,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 
0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384513 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1754,7 +1902,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000 ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1763,7 +1915,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: @@ -1796,11 +1948,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384512 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1851,7 +2007,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1860,7 +2020,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0: @@ -1893,11 +2053,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380417 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1948,7 +2112,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) ; ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000 ; GFX12-GISEL-NEXT: s_brev_b32 s1, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1957,7 +2125,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1: @@ -1990,11 +2158,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380416 -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -2038,9 +2210,9 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2088,9 +2260,9 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2138,9 +2310,9 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 th:TH_LOAD_RT_NT -; GFX12-NEXT: 
s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2188,9 +2360,9 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2238,9 +2410,9 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2291,9 +2463,9 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2360,9 +2532,9 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2435,9 +2607,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2485,9 +2657,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2535,9 +2707,9 @@ define amdgpu_kernel void 
@global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2588,9 +2760,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2657,9 +2829,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2740,9 +2912,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2823,13 +2995,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2863,12 +3035,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2924,13 +3096,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2964,12 +3136,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3025,13 +3197,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; GFX12-GISEL-LABEL: 
global_inst_salu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3065,12 +3237,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3126,13 +3298,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; 
GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3166,12 +3338,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3227,13 +3399,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: 
s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3267,12 +3439,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3328,13 +3500,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) @@ -3368,12 +3540,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 -; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3428,11 +3600,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 -; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3444,10 +3616,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x7ff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; 
GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3502,11 +3674,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 -; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3518,10 +3690,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x800 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3576,11 +3748,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 -; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3592,10 +3764,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xfff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3650,11 +3822,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 -; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) @@ -3666,10 +3838,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1000 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3724,11 +3896,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 -; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3740,10 +3912,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1fff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] 
scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3798,11 +3970,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 -; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3814,10 +3986,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x2000 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 -; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT -; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll index 17b3fdc04ec934..c272e8e9788ea0 100644 --- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -21,9 +21,9 @@ declare i64 @llvm.readcyclecounter() #0 ; GFX12: s_getreg_b32 [[HI2:s[0-9]+]], 
hwreg(HW_REG_SHADER_CYCLES_HI) ; GFX12: s_cmp_eq_u32 [[HI1]], [[HI2]] ; GFX12: s_cselect_b32 {{s[0-9]+}}, [[LO1]], 0 -; GCN-DAG: lgkmcnt +; GCN-DAG: kmcnt ; MEMTIME: store_dwordx2 -; SIVI-NOT: lgkmcnt +; SIVI-NOT: kmcnt ; MEMTIME: s_memtime s{{\[[0-9]+:[0-9]+\]}} ; MEMTIME: store_dwordx2 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 4695cadd45aeed..65c75363f43466 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -44,7 +44,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX12-LABEL: s_sub_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -96,7 +96,7 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; GFX12-LABEL: s_sub_imm_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -157,9 +157,9 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -221,9 +221,9 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0x7b, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -286,9 +286,9 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v3 ; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v2 ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] @@ -367,11 +367,11 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16 ; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3 ; GFX12-NEXT: v_sub_nc_u32_e32 v2, v6, v2 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1 @@ -449,11 +449,11 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) -; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:2 th:TH_LOAD_RT_NT -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 
0x0 +; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:2 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u16 v0, v1, v0 ; GFX12-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -530,9 +530,9 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -620,9 +620,9 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_sub_i16 v1, v1, v3 ; GFX12-NEXT: v_pk_sub_i16 v0, v0, v2 ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] @@ -686,7 +686,7 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -767,11 +767,11 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[4:5] @@ -862,11 +862,11 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX12-NEXT: global_load_b128 v[4:7], v4, s[0:1] -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 @@ -991,16 +991,16 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 -; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7] ; GFX12-NEXT: global_load_b128 v[4:7], v12, s[0:1] ; GFX12-NEXT: global_load_b128 v[8:11], v12, s[6:7] offset:16 ; GFX12-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:16 -; GFX12-NEXT: s_waitcnt vmcnt(2) +; GFX12-NEXT: s_wait_loadcnt 0x2 ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_co_u32 v10, vcc_lo, v10, v14 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v11, v15, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v8, vcc_lo, v8, v12 diff --git 
a/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir index c06e931c65d5e4..ab20333cee9d0d 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir @@ -19,7 +19,7 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX12-NEXT: GLOBAL_INV 16, implicit $exec - ; GFX12-NEXT: S_WAITCNT 1015 + ; GFX12-NEXT: S_WAIT_LOADCNT 0 ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1) GLOBAL_INV 16, implicit $exec