diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index fd47970bd05059..6d94cfdc720919 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -689,10 +689,18 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores(
   emitCalleeSavedRestores(MBB, MBBI, true);
 }
 
+// Return the maximum possible number of bytes for `Size` due to the
+// architectural limit on the size of an SVE register.
+static int64_t upperBound(StackOffset Size) {
+  static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
+  return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed();
+}
+
 void AArch64FrameLowering::allocateStackSpace(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    bool NeedsRealignment, StackOffset AllocSize, bool NeedsWinCFI,
-    bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const {
+    int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI,
+    bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset,
+    bool FollowupAllocs) const {
 
   if (!AllocSize)
     return;
@@ -704,27 +712,128 @@ void AArch64FrameLowering::allocateStackSpace(
   AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
 
-  Register TargetReg =
-      NeedsRealignment ? findScratchNonCalleeSaveRegister(&MBB) : AArch64::SP;
-  // SUB Xd/SP, SP, AllocSize
+  const int64_t MaxAlign = MFI.getMaxAlign().value();
+  const uint64_t AndMask = ~(MaxAlign - 1);
+
+  if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) {
+    Register TargetReg = RealignmentPadding
+                             ? findScratchNonCalleeSaveRegister(&MBB)
+                             : AArch64::SP;
+    // SUB Xd/SP, SP, AllocSize
+    emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
+                    MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
+                    EmitCFI, InitialOffset);
+
+    if (RealignmentPadding) {
+      // AND SP, X9, 0b11111...0000
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
+          .addReg(TargetReg, RegState::Kill)
+          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
+          .setMIFlags(MachineInstr::FrameSetup);
+      AFI.setStackRealigned(true);
+
+      // No need for SEH instructions here; if we're realigning the stack,
+      // we've set a frame pointer and already finished the SEH prologue.
+      assert(!NeedsWinCFI);
+    }
+    return;
+  }
+
+  //
+  // Stack probing allocation.
+  //
+
+  // Fixed length allocation. If we don't need to re-align the stack and don't
+  // have SVE objects, we can use a more efficient sequence for stack probing.
+  if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) {
+    Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+    assert(ScratchReg != AArch64::NoRegister);
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC))
+        .addDef(ScratchReg)
+        .addImm(AllocSize.getFixed())
+        .addImm(InitialOffset.getFixed())
+        .addImm(InitialOffset.getScalable());
+    // The fixed allocation may leave unprobed bytes at the top of the
+    // stack. If we have a subsequent allocation (e.g. if we have
+    // variable-sized objects), we need to issue an extra probe, so these
+    // allocations start in a known state.
+    if (FollowupAllocs) {
+      // STR XZR, [SP]
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+
+    return;
+  }
+
+  // Variable length allocation.
+
+  // If the (unknown) allocation size cannot exceed the probe size, decrement
+  // the stack pointer right away.
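+  // A single SUB suffices in that case: at most upperBound(AllocSize) +
+  // RealignmentPadding bytes are left unprobed, and the code below emits a
+  // trailing probe whenever that bound exceeds StackProbeMaxUnprobedStack
+  // or further allocations follow.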
+  int64_t ProbeSize = AFI.getStackProbeSize();
+  if (upperBound(AllocSize) + RealignmentPadding <= ProbeSize) {
+    Register ScratchReg = RealignmentPadding
+                              ? findScratchNonCalleeSaveRegister(&MBB)
+                              : AArch64::SP;
+    assert(ScratchReg != AArch64::NoRegister);
+    // SUB Xd, SP, AllocSize
+    emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII,
+                    MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
+                    EmitCFI, InitialOffset);
+    if (RealignmentPadding) {
+      // AND SP, Xn, 0b11111...0000
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
+          .addReg(ScratchReg, RegState::Kill)
+          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
+          .setMIFlags(MachineInstr::FrameSetup);
+      AFI.setStackRealigned(true);
+    }
+    if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding >
+                              AArch64::StackProbeMaxUnprobedStack) {
+      // STR XZR, [SP]
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    return;
+  }
+
+  // Emit a variable-length allocation probing loop.
+  // TODO: As an optimisation, the loop can be "unrolled" into a few parts,
+  // each of them guaranteed to adjust the stack by less than the probe size.
+  Register TargetReg = findScratchNonCalleeSaveRegister(&MBB);
+  assert(TargetReg != AArch64::NoRegister);
+  // SUB Xd, SP, AllocSize
   emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
                   MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
                   EmitCFI, InitialOffset);
-
-  if (NeedsRealignment) {
-    const int64_t MaxAlign = MFI.getMaxAlign().value();
-    const uint64_t AndMask = ~(MaxAlign - 1);
-    // AND SP, Xd, 0b11111...0000
-    BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
+  if (RealignmentPadding) {
+    // AND Xn, Xn, 0b11111...0000
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), TargetReg)
         .addReg(TargetReg, RegState::Kill)
         .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
         .setMIFlags(MachineInstr::FrameSetup);
-    AFI.setStackRealigned(true);
+  }
 
-    // No need for SEH instructions here; if we're realigning the stack,
-    // we've set a frame pointer and already finished the SEH prologue.
-    assert(!NeedsWinCFI);
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR))
+      .addReg(TargetReg);
+  if (EmitCFI) {
+    // Set the CFA register back to SP.
+    unsigned Reg =
+        Subtarget.getRegisterInfo()->getDwarfRegNum(AArch64::SP, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
   }
+  if (RealignmentPadding)
+    AFI.setStackRealigned(true);
 }
 
 static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
@@ -905,9 +1014,11 @@ bool AArch64FrameLowering::canUseAsPrologue(
   MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
 
-  // Don't need a scratch register if we're not going to re-align the stack.
-  if (!RegInfo->hasStackRealignment(*MF))
+  // Don't need a scratch register if we're not going to re-align the stack or
+  // emit stack probes.
+  if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF))
     return true;
   // Otherwise, we can use any block as long as it has a scratch register
   // available.
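(An illustrative aside for review, not part of the patch: a self-contained model of the `upperBound` arithmetic above. It assumes the scalable component of a `StackOffset` counts bytes at the minimum 128-bit SVE vector length; the 2048-bit architectural maximum then yields the factor of 16. The numbers line up with the sve_16_vector and sve_17_vector tests further down.)

  #include <cassert>
  #include <cstdint>

  // Worst case for a scalable allocation: one "scalable byte" (a byte at the
  // 128-bit minimum vector length) can grow to 16 bytes at the 2048-bit
  // architectural maximum.
  static int64_t upperBoundModel(int64_t ScalableBytes, int64_t FixedBytes) {
    constexpr int64_t MaxBytesPerScalableByte = 2048 / 128; // == 16
    return ScalableBytes * MaxBytesPerScalableByte + FixedBytes;
  }

  int main() {
    // 16 SVE vectors are 16 * 16 = 256 scalable bytes: at most 4096 bytes,
    // so they fit under the default probe size and need no probing loop.
    assert(upperBoundModel(16 * 16, 0) == 4096);
    // 17 SVE vectors can reach 17 * 256 = 4352 bytes > 4096, so a probing
    // loop is required.
    assert(upperBoundModel(17 * 16, 0) == 4352);
    return 0;
  }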
@@ -917,15 +1028,11 @@ bool AArch64FrameLowering::canUseAsPrologue(
 
 static bool windowsRequiresStackProbe(MachineFunction &MF,
                                       uint64_t StackSizeInBytes) {
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  if (!Subtarget.isTargetWindows())
-    return false;
-  const Function &F = MF.getFunction();
+  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
   // TODO: When implementing stack protectors, take that into account
   // for the probe threshold.
-  unsigned StackProbeSize =
-      F.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
-  return (StackSizeInBytes >= StackProbeSize) &&
-         !F.hasFnAttribute("no-stack-arg-probe");
+  return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
+         StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
 }
 
 static bool needsWinCFI(const MachineFunction &MF) {
@@ -1731,7 +1838,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // Alignment is required for the parent frame, not the funclet
   const bool NeedsRealignment =
       NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
-  int64_t RealignmentPadding =
+  const int64_t RealignmentPadding =
       (NeedsRealignment && MFI.getMaxAlign() > Align(16))
           ? MFI.getMaxAlign().value() - 16
           : 0;
@@ -1867,6 +1974,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     // Process the SVE callee-saves to determine what space needs to be
     // allocated.
     if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
+      LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
+                        << "\n");
       // Find callee save instructions in frame.
       CalleeSavesBegin = MBBI;
       assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
@@ -1881,8 +1990,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     // Allocate space for the callee saves (if any).
     StackOffset CFAOffset =
         StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
-    allocateStackSpace(MBB, CalleeSavesBegin, false, SVECalleeSavesSize, false,
-                       nullptr, EmitAsyncCFI && !HasFP, CFAOffset);
+    StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
+    allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false,
+                       nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
+                       MFI.hasVarSizedObjects() || LocalsSize);
     CFAOffset += SVECalleeSavesSize;
 
     if (EmitAsyncCFI)
@@ -1896,10 +2007,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
       // the correct value here, as NumBytes also includes padding bytes,
       // which shouldn't be counted here.
-      allocateStackSpace(MBB, CalleeSavesEnd, NeedsRealignment,
+      allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
                          SVELocalsSize + StackOffset::getFixed(NumBytes),
                          NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
-                         CFAOffset);
+                         CFAOffset, MFI.hasVarSizedObjects());
     }
 
   // If we need a base pointer, set it up here. It's whatever the value of the
@@ -4109,3 +4220,170 @@ void AArch64FrameLowering::orderFrameObjects(
     dbgs() << "\n";
   });
 }
+
+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
+/// least every ProbeSize bytes. Returns an iterator of the first instruction
+/// after the loop. The difference between SP and TargetReg must be an exact
+/// multiple of ProbeSize.
+MachineBasicBlock::iterator
+AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
+    MachineBasicBlock::iterator MBBI, int64_t ProbeSize,
+    Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable
+  // in SUB).
+  emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP,
+                  StackOffset::getFixed(-ProbeSize), TII,
+                  MachineInstr::FrameSetup);
+  // STR XZR, [SP]
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(MachineInstr::FrameSetup);
+  // CMP SP, TargetReg
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+          AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+      .setMIFlags(MachineInstr::FrameSetup);
+  // B.CC Loop
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc))
+      .addImm(AArch64CC::NE)
+      .addMBB(LoopMBB)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
+void AArch64FrameLowering::inlineStackProbeFixed(
+    MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
+    StackOffset CFAOffset) const {
+  MachineBasicBlock *MBB = MBBI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
+  bool HasFP = hasFP(MF);
+
+  DebugLoc DL;
+  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
+  int64_t NumBlocks = FrameSize / ProbeSize;
+  int64_t ResidualSize = FrameSize % ProbeSize;
+
+  LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
+                    << NumBlocks << " blocks of " << ProbeSize
+                    << " bytes, plus " << ResidualSize << " bytes\n");
+
+  // Decrement SP by NumBlock * ProbeSize bytes, with either an unrolled
+  // sequence or an ordinary loop.
+  if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
+    for (int i = 0; i < NumBlocks; ++i) {
+      // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not
+      // encodable in a SUB).
+      emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                      StackOffset::getFixed(-ProbeSize), TII,
+                      MachineInstr::FrameSetup, false, false, nullptr,
+                      EmitAsyncCFI && !HasFP, CFAOffset);
+      CFAOffset += StackOffset::getFixed(ProbeSize);
+      // STR XZR, [SP]
+      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  } else if (NumBlocks != 0) {
+    // SUB ScratchReg, SP, #FrameSize (or equivalent if FrameSize is not
+    // encodable in ADD). ScratchReg may temporarily become the CFA register.
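+    // While the loop below runs, the CFA is expressed relative to ScratchReg
+    // (see the "def_cfa $w9" checks in the tests); it is switched back to SP
+    // once the loop is done.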
+    emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP,
+                    StackOffset::getFixed(-ProbeSize * NumBlocks), TII,
+                    MachineInstr::FrameSetup, false, false, nullptr,
+                    EmitAsyncCFI && !HasFP, CFAOffset);
+    CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks);
+    MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg);
+    MBB = MBBI->getParent();
+    if (EmitAsyncCFI && !HasFP) {
+      // Set the CFA register back to SP.
+      const AArch64RegisterInfo &RegInfo =
+          *MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+      unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+
+  if (ResidualSize != 0) {
+    // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not
+    // encodable in SUB).
+    emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                    StackOffset::getFixed(-ResidualSize), TII,
+                    MachineInstr::FrameSetup, false, false, nullptr,
+                    EmitAsyncCFI && !HasFP, CFAOffset);
+    if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) {
+      // STR XZR, [SP]
+      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+}
+
+void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
+                                            MachineBasicBlock &MBB) const {
+  // Get the instructions that need to be replaced. We emit at most two of
+  // these. Remember them in order to avoid complications coming from the need
+  // to traverse the block while potentially creating more blocks.
+  SmallVector<MachineInstr *, 4> ToReplace;
+  for (MachineInstr &MI : MBB)
+    if (MI.getOpcode() == AArch64::PROBED_STACKALLOC ||
+        MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR)
+      ToReplace.push_back(&MI);
+
+  for (MachineInstr *MI : ToReplace) {
+    if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) {
+      Register ScratchReg = MI->getOperand(0).getReg();
+      int64_t FrameSize = MI->getOperand(1).getImm();
+      StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(),
+                                               MI->getOperand(3).getImm());
+      inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize,
+                            CFAOffset);
+    } else {
+      assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR &&
+             "Stack probe pseudo-instruction expected");
+      const AArch64InstrInfo *TII =
+          MI->getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
+      Register TargetReg = MI->getOperand(0).getReg();
+      (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true);
+    }
+    MI->eraseFromParent();
+  }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index f3313f3b53fffe..941af03a78b738 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -152,13 +152,26 @@ class AArch64FrameLowering : public TargetFrameLowering {
                                     MachineBasicBlock::iterator MBBI) const;
   void allocateStackSpace(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
-                          bool NeedsRealignment, StackOffset AllocSize,
+                          int64_t RealignmentPadding, StackOffset AllocSize,
                           bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
-                          StackOffset InitialOffset) const;
+                          StackOffset InitialOffset, bool FollowupAllocs) const;
 
   /// Emit target zero call-used regs.
  void emitZeroCallUsedRegs(BitVector RegsToZero,
                            MachineBasicBlock &MBB) const override;
+
+  /// Replace a StackProbe stub (if any) with the actual probe code inline.
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologueMBB) const override;
+
+  void inlineStackProbeFixed(MachineBasicBlock::iterator MBBI,
+                             Register ScratchReg, int64_t FrameSize,
+                             StackOffset CFAOffset) const;
+
+  MachineBasicBlock::iterator
+  inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI,
+                                    int64_t ProbeSize,
+                                    Register TargetReg) const;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 149a6d413d8144..cb093a1613110e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26815,3 +26815,9 @@ unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
 
   return NumRegs;
 }
+
+bool AArch64TargetLowering::hasInlineStackProbe(
    const MachineFunction &MF) const {
+  return !Subtarget->isTargetWindows() &&
+         MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c67c7c5affdc48..25d7cb6d212d1f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -514,6 +514,13 @@ const unsigned RoundingBitsPos = 22;
 ArrayRef<MCPhysReg> getGPRArgRegs();
 ArrayRef<MCPhysReg> getFPRArgRegs();
 
+/// Maximum allowed number of unprobed bytes above SP at an ABI
+/// boundary.
+const unsigned StackProbeMaxUnprobedStack = 1024;
+
+/// Maximum number of iterations to unroll for a constant size probing loop.
+const unsigned StackProbeMaxLoopUnroll = 4;
+
 } // namespace AArch64
 
 class AArch64Subtarget;
@@ -966,6 +973,9 @@ class AArch64TargetLowering : public TargetLowering {
                                            unsigned &NumIntermediates,
                                            MVT &RegisterVT) const override;
 
+  /// True if stack clash protection is enabled for this function.
+  bool hasInlineStackProbe(const MachineFunction &MF) const override;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
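(Another aside for review, not part of the patch: how `inlineStackProbeFixed` splits a constant-size allocation, using the default 4096-byte probe size and the 200000 bytes of locals from the stack-probing-last-in-block.mir test below. The names here are local to this sketch.)

  #include <cstdint>
  #include <cstdio>

  int main() {
    const int64_t ProbeSize = 4096;   // default probe size
    const int64_t FrameSize = 200000; // locals in the MIR test below
    const int64_t NumBlocks = FrameSize / ProbeSize; // 48
    const int64_t Residual = FrameSize % ProbeSize;  // 3392
    // NumBlocks (48) exceeds StackProbeMaxLoopUnroll (4), so the
    // 48 * 4096 = 196608 bytes are allocated by a probing loop
    // ("sub sp; str xzr, [sp]; cmp sp; b.ne"), followed by one SUB of the
    // 3392-byte residue. Because 3392 > StackProbeMaxUnprobedStack (1024),
    // the residue also gets a trailing "str xzr, [sp]" probe.
    std::printf("%lld blocks of %lld bytes, plus %lld bytes\n",
                (long long)NumBlocks, (long long)ProbeSize,
                (long long)Residual);
    return 0;
  }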
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index e97f17e3f49c58..72ee532d4aa6e5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AArch64ExpandImm.h"
 #include "AArch64InstrInfo.h"
+#include "AArch64ExpandImm.h"
 #include "AArch64FrameLowering.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64PointerAuth.h"
@@ -21,6 +21,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -9461,6 +9462,94 @@ bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
   return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
 }
 
+MachineBasicBlock::iterator
+AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
+                                   Register TargetReg, bool FrameSetup) const {
+  assert(TargetReg != AArch64::SP &&
+         "New top of stack cannot already be in SP");
+
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+  MachineInstr::MIFlag Flags =
+      FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
+
+  // LoopTest:
+  //   SUB SP, SP, #ProbeSize
+  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
+                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
+
+  //   CMP SP, TargetReg
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+          AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+      .setMIFlags(Flags);
+
+  //   B.<Cond> LoopExit
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
+      .addImm(AArch64CC::LE)
+      .addMBB(ExitMBB)
+      .setMIFlags(Flags);
+
+  //   STR XZR, [SP]
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(Flags);
+
+  //   B loop
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
+      .addMBB(LoopTestMBB)
+      .setMIFlags(Flags);
+
+  // LoopExit:
+  //   MOV SP, TargetReg
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(0)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+      .setMIFlags(Flags);
+
+  //   STR XZR, [SP]
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(Flags);
+
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  LoopTestMBB->addSuccessor(ExitMBB);
+  LoopTestMBB->addSuccessor(LoopBodyMBB);
+  LoopBodyMBB->addSuccessor(LoopTestMBB);
+  MBB.addSuccessor(LoopTestMBB);
+
+  // Update liveins.
+  if (MF.getRegInfo().reservedRegsFrozen()) {
+    recomputeLiveIns(*LoopTestMBB);
+    recomputeLiveIns(*LoopBodyMBB);
+    recomputeLiveIns(*ExitMBB);
+  }
+
+  return ExitMBB->begin();
+}
+
 #define GET_INSTRINFO_HELPERS
 #define GET_INSTRMAP_INFO
 #include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index cc588cdad6b8e5..f780cd42c63ce1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -383,6 +383,13 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                              unsigned Scale) const;
 
+  // Decrement the SP, issuing probes along the way. `TargetReg` is the new
+  // top of the stack. `FrameSetup` is true if the allocation is part of
+  // constructing the activation frame of a function.
+  MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI,
+                                               Register TargetReg,
+                                               bool FrameSetup) const;
+
 #define GET_INSTRINFO_HELPER_DECLS
 #include "AArch64GenInstrInfo.inc"
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b94bc101dc6026..60cd94f4ff8e12 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -936,7 +936,8 @@ include "SMEInstrFormats.td"
 // Miscellaneous instructions.
 //===----------------------------------------------------------------------===//
 
-let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+let hasSideEffects = 1, isCodeGenOnly = 1 in {
+let Defs = [SP], Uses = [SP] in {
 // We set Sched to empty list because we expect these instructions to simply get
 // removed in most cases.
 def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
@@ -945,7 +946,27 @@ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
 def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                             [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
                             Sched<[]>;
-} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
+
+}
+
+let Defs = [SP, NZCV], Uses = [SP] in {
+// Probed stack allocation of a constant size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch),
+                               (ins i64imm:$stacksize, i64imm:$fixed_offset,
+                                i64imm:$scalable_offset),
+                               []>,
+                               Sched<[]>;
+
+// Probed stack allocation of a variable size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC_VAR : Pseudo<(outs),
+                                   (ins GPR64sp:$target),
+                                   []>,
+                                   Sched<[]>;
+
+} // Defs = [SP, NZCV], Uses = [SP] in
+} // hasSideEffects = 1, isCodeGenOnly = 1
 
 let isReMaterializable = 1, isCodeGenOnly = 1 in {
 // FIXME: The following pseudo instructions are only needed because remat
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 7bb5041b8ba948..f8b73ba6dda3c2 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -97,14 +97,45 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
     if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
             F.getParent()->getModuleFlag("branch-target-enforcement")))
       BranchTargetEnforcement = BTE->getZExtValue();
-    return;
+  } else {
+    const StringRef BTIEnable =
+        F.getFnAttribute("branch-target-enforcement").getValueAsString();
+    assert(BTIEnable.equals_insensitive("true") ||
+           BTIEnable.equals_insensitive("false"));
+    BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
   }
 
-  const StringRef BTIEnable =
-      F.getFnAttribute("branch-target-enforcement").getValueAsString();
-  assert(BTIEnable.equals_insensitive("true") ||
-         BTIEnable.equals_insensitive("false"));
-  BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  uint64_t ProbeSize = 4096;
+  if (F.hasFnAttribute("stack-probe-size"))
+    ProbeSize = F.getFnAttributeAsParsedInteger("stack-probe-size");
+  else if (const auto *PS = mdconst::extract_or_null<ConstantInt>(
+               F.getParent()->getModuleFlag("stack-probe-size")))
+    ProbeSize = PS->getZExtValue();
+  assert(int64_t(ProbeSize) > 0 && "Invalid stack probe size");
+
+  if (STI->isTargetWindows()) {
+    if (!F.hasFnAttribute("no-stack-arg-probe"))
+      StackProbeSize = ProbeSize;
+  } else {
+    // Round down to the stack alignment.
+    uint64_t StackAlign =
+        STI->getFrameLowering()->getTransientStackAlign().value();
+    ProbeSize = std::max(StackAlign, ProbeSize & ~(StackAlign - 1U));
+    StringRef ProbeKind;
+    if (F.hasFnAttribute("probe-stack"))
+      ProbeKind = F.getFnAttribute("probe-stack").getValueAsString();
+    else if (const auto *PS = dyn_cast_or_null<MDString>(
+                 F.getParent()->getModuleFlag("probe-stack")))
+      ProbeKind = PS->getString();
+    if (ProbeKind.size()) {
+      if (ProbeKind != "inline-asm")
+        report_fatal_error("Unsupported stack probing method");
+      StackProbeSize = ProbeSize;
+    }
+  }
 }
 
 MachineFunctionInfo *AArch64FunctionInfo::clone(
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 0b8bfb04a572c7..219f83cfd32e0e 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -194,6 +194,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// True if the function needs asynchronous unwind information.
   mutable std::optional<bool> NeedsAsyncDwarfUnwindInfo;
 
+  int64_t StackProbeSize = 0;
+
 public:
   AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
 
@@ -456,6 +458,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
     HasStreamingModeChanges = HasChanges;
   }
 
+  bool hasStackProbing() const { return StackProbeSize != 0; }
+
+  int64_t getStackProbeSize() const { return StackProbeSize; }
+
 private:
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
new file mode 100644
index 00000000000000..945c271d375001
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
@@ -0,0 +1,392 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
+
+; Tests for prolog sequences for stack probing when using a 64KiB stack guard.
+
+; 64k bytes is the largest frame we can probe in one go.
+define void @static_65536(ptr %out) #0 {
+; CHECK-LABEL: static_65536:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 65552
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 65536, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+16 bytes, still needs just one probe.
+define void @static_65552(ptr %out) #0 {
+; CHECK-LABEL: static_65552:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 65552
+; CHECK-NEXT: str xzr, [sp], #-16
+; CHECK-NEXT: .cfi_def_cfa_offset 65568
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 65552, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+1024 bytes, the largest frame which needs just one probe.
+define void @static_66560(ptr %out) #0 {
+; CHECK-LABEL: static_66560:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 66576 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 66560, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 64k+1024+16 bytes, the smallest frame which needs two probes. +define void @static_66576(ptr %out) #0 { +; CHECK-LABEL: static_66576: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 66592 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 66576, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 2*64k+1024, the largest frame needing two probes. +define void @static_132096(ptr %out) #0 { +; CHECK-LABEL: static_132096: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 131088 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 132112 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #32, lsl #12 // =131072 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 132096, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 5*64k-16, the largest frame probed without a loop. +define void @static_327664(ptr %out) #0 { +; CHECK-LABEL: static_327664: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 65552
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 131088
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 196624
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: .cfi_def_cfa_offset 262160
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440
+; CHECK-NEXT: .cfi_def_cfa_offset 323600
+; CHECK-NEXT: sub sp, sp, #4080
+; CHECK-NEXT: .cfi_def_cfa_offset 327680
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #79, lsl #12 // =323584
+; CHECK-NEXT: .cfi_def_cfa_offset 4096
+; CHECK-NEXT: add sp, sp, #4080
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 327664, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k, smallest frame probed with a loop.
+define void @static_327680(ptr %out) #0 {
+; CHECK-LABEL: static_327680:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa w9, 327696
+; CHECK-NEXT: .LBB6_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB6_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 327680, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k+1024, large enough to use a loop, but not a multiple of 64KiB
+; so has a remainder, but no extra probe.
+define void @static_328704(ptr %out) #0 {
+; CHECK-LABEL: static_328704:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa w9, 327696
+; CHECK-NEXT: .LBB7_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB7_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 328720
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 328704, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k+1040, large enough to use a loop, has a remainder and
+; an extra probe.
+define void @static_328720(ptr %out) #0 {
+; CHECK-LABEL: static_328720:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa w9, 327696
+; CHECK-NEXT: .LBB8_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB8_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 328736
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 328720, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation, but with a very large alignment requirement. We do this
+; by moving SP far enough that a sufficiently-aligned block will exist
+; somewhere in the stack frame, so must probe the whole of that larger SP move.
+define void @static_16_align_131072(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_131072:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #31, lsl #12 // =126976 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xfffffffffffe0000 +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_1 +; CHECK-NEXT: .LBB9_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 16, align 131072 + store i8* %v, ptr %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement which +; is nevertheless small enough as to not need a loop. +define void @static_16_align_8192(ptr %out) #0 { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and sp, x9, #0xffffffffffffe000 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 16, align 8192 + store i8* %v, ptr %out, align 8 + ret void +} + +; A large allocation with a very large alignment requirement which +; is nevertheless small enough as to not need a loop. +define void @static_32752_align_32k(ptr %out) #0 { +; CHECK-LABEL: static_32752_align_32k: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #7, lsl #12 // =28672 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and sp, x9, #0xffffffffffff8000 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 32752, align 32768 + store i8* %v, ptr %out, align 8 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" } diff --git a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir new file mode 100644 index 00000000000000..6c8ec7e4c4fa92 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir @@ -0,0 +1,146 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -run-pass=prologepilog %s -o - | FileCheck %s +# Regression test for a crash when the probing instruction +# to replace is last in the block. +--- | + source_filename = "tt.ll" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-linux" + + declare i1 @g(ptr) + + define void @f(ptr %out) #0 { + entry: + %p = alloca i32, i32 50000, align 4 + br label %loop + + loop: ; preds = %loop, %entry + %c = call i1 @g(ptr %p) + br i1 %c, label %loop, label %exit + + exit: ; preds = %loop + ret void + } + + attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" "target-features"="+sve" } + +... 
+--- +name: f +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: true +registers: [] +liveins: [] +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: true + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 200000 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: p, type: default, offset: 0, size: 200000, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: -200000, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: f + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $lr, $fp + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.2), (store (s64) into %stack.1) + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 + ; CHECK-NEXT: $x9 = frame-setup SUBXri $sp, 48, 12 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w9, 196624 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.entry: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: liveins: $x9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1, 12 + ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 + ; CHECK-NEXT: $xzr = frame-setup SUBSXrx64 $sp, $x9, 24, implicit-def $nzcv + ; CHECK-NEXT: frame-setup Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $wsp + ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 3392, 0 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 200016 + ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop: + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x0 = ADDXri $sp, 0, 0 + ; CHECK-NEXT: BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 + ; CHECK-NEXT: TBNZW killed renamable $w0, 0, %bb.1 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.exit: + ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 48, 12 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 3408 + ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 3392, 0 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1) + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION 
restore $w30
+  ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+  ; CHECK-NEXT: RET_ReallyLR
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+
+
+  bb.1.loop:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $x0 = ADDXri %stack.0.p, 0, 0
+    BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    TBNZW killed renamable $w0, 0, %bb.1
+    B %bb.2
+
+  bb.2.exit:
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
new file mode 100644
index 00000000000000..4dad104e66f20d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -0,0 +1,724 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
+
+; Test prolog sequences for stack probing when SVE objects are involved.
+
+; The space for SVE objects needs probing in the general case, because
+; the stack adjustment may happen to be too big (i.e. greater than the
+; probe size) to allocate with a single `addvl`.
+; When we do know that the stack adjustment cannot exceed the probe size
+; we can avoid emitting a probe loop and emit a simple `addvl; str`
+; sequence instead.
+
+define void @sve_1_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 4 SVE vectors of stack space.
+define void @sve_4_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_4_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 16 SVE vectors of stack space.
+; The stack adjustment is less than or equal to 16 x 256 = 4096, so
+; we can allocate the locals at once.
+define void @sve_16_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_16_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-16
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: addvl sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  %vec6 = alloca <vscale x 4 x float>, align 16
+  %vec7 = alloca <vscale x 4 x float>, align 16
+  %vec8 = alloca <vscale x 4 x float>, align 16
+  %vec9 = alloca <vscale x 4 x float>, align 16
+  %vec10 = alloca <vscale x 4 x float>, align 16
+  %vec11 = alloca <vscale x 4 x float>, align 16
+  %vec12 = alloca <vscale x 4 x float>, align 16
+  %vec13 = alloca <vscale x 4 x float>, align 16
+  %vec14 = alloca <vscale x 4 x float>, align 16
+  %vec15 = alloca <vscale x 4 x float>, align 16
+  %vec16 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 17 SVE vectors of stack space. Now we need
+; a probing loop, since the stack adjustment may be greater than
+; the probe size (17 x 256 = 4352 bytes).
+; TODO: Allocating `k*16+r` SVE vectors can be unrolled into
+; emitting the `k + r` sequences of `addvl sp, sp, #-N; str xzr, [sp]`
+define void @sve_17_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_17_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl x9, sp, #-17
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
+; CHECK-NEXT: .LBB3_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB3_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB3_1
+; CHECK-NEXT: .LBB3_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: addvl sp, sp, #17
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  %vec6 = alloca <vscale x 4 x float>, align 16
+  %vec7 = alloca <vscale x 4 x float>, align 16
+  %vec8 = alloca <vscale x 4 x float>, align 16
+  %vec9 = alloca <vscale x 4 x float>, align 16
+  %vec10 = alloca <vscale x 4 x float>, align 16
+  %vec11 = alloca <vscale x 4 x float>, align 16
+  %vec12 = alloca <vscale x 4 x float>, align 16
+  %vec13 = alloca <vscale x 4 x float>, align 16
+  %vec14 = alloca <vscale x 4 x float>, align 16
+  %vec15 = alloca <vscale x 4 x float>, align 16
+  %vec16 = alloca <vscale x 4 x float>, align 16
+  %vec17 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; Space for callee-saved SVE registers is allocated similarly to allocating
+; space for SVE locals. When we know the stack adjustment cannot exceed the
+; probe size we can skip the explicit probe, since saving SVE registers serves
+; as an implicit probe.
+define void @sve_1v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_1v_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  call void asm sideeffect "", "~{z8}" ()
+  ret void
+}
+
+define void @sve_4v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_4v_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" ()
+  ret void
+}
+
+define void @sve_16v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_16v_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]!
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-16
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: .cfi_restore z12
+; CHECK-NEXT: .cfi_restore z13
+; CHECK-NEXT: .cfi_restore z14
+; CHECK-NEXT: .cfi_restore z15
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
+  ret void
+}
+
+define void @sve_1p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_1p_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  call void asm sideeffect "", "~{p8}" ()
+  ret void
+}
+
+define void @sve_4p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_4p_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" ()
+  ret void
+}
+
+define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_16v_1p_csr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl x9, sp, #-17
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
+; CHECK-NEXT: .LBB9_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB9_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB9_1
+; CHECK-NEXT: .LBB9_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #17
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: .cfi_restore z12
+; CHECK-NEXT: .cfi_restore z13
+; CHECK-NEXT: .cfi_restore z14
+; CHECK-NEXT: .cfi_restore z15
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
+  ret void
+}
+
+; An SVE vector and a 16-byte fixed-size object.
+define void @sve_1_vector_16_arr(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_16_arr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 32
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  %arr = alloca i8, i64 16, align 1
+  ret void
+}
+
+; A large SVE stack object and a large stack slot, both of which need probing.
+; TODO: This could be optimised by combining the fixed-size offset into the
+; loop.
+define void @sve_1_vector_4096_arr(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_4096_arr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288
+; CHECK-NEXT: .cfi_def_cfa w9, 12304
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG
+; CHECK-NEXT: .LBB11_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB11_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB11_1
+; CHECK-NEXT: .LBB11_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: .cfi_def_cfa wsp, 12304
+; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %vec = alloca <vscale x 256 x float>, align 16
+  %arr = alloca i8, i64 12288, align 1
+  ret void
+}
+
+; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently
+; supported even without stack-probing.
+
+; An SVE vector, and a 16-byte fixed-size object, which
+; has a large alignment requirement.
+define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_16_arr_align_8192:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: sub x9, x9, #4080
+; CHECK-NEXT: addvl x9, x9, #-1
+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT: .LBB12_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB12_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB12_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB12_1
+; CHECK-NEXT: .LBB12_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  %arr = alloca i8, i64 16, align 8192
+  ret void
+}
+
+; With 64k guard pages, we can allocate bigger SVE space without a probing loop.
+define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: sve_1024_64k_guard:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG
+; CHECK-NEXT: addvl sp, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
+; CHECK-NEXT: addvl sp, sp, #8
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %vec = alloca <vscale x 1024 x float>, align 16
+  ret void
+}
+
+define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: sve_1028_64k_guard:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl x9, sp, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG
+; CHECK-NEXT: addvl x9, x9, #-32
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG
+; CHECK-NEXT: addvl x9, x9, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG
+; CHECK-NEXT: .LBB14_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB14_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB14_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB14_1
+; CHECK-NEXT: .LBB14_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG
+; CHECK-NEXT: addvl sp, sp, #31
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
+; CHECK-NEXT: addvl sp, sp, #9
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %vec = alloca <vscale x 1024 x float>, align 16
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; With 5 SVE vectors of stack space the unprobed area
+; at the top of the stack can exceed 1024 bytes (5 x 256 == 1280),
+; hence we need to issue a probe.
+define void @sve_5_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_5_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; Test with 14 scalable bytes (so up to 14 * 16 = 224 bytes) of unprobed
+; area below the save location of `p9`.
+define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
+; CHECK-LABEL: sve_unprobed_area:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" ()
+
+  %v0 = alloca <vscale x 4 x float>, align 16
+  %v1 = alloca <vscale x 4 x float>, align 16
+  %v2 = alloca <vscale x 4 x float>, align 16
+  %v3 = alloca <vscale x 4 x float>, align 16
+
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll
new file mode 100644
index 00000000000000..5c5d9321a56e58
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing.ll
@@ -0,0 +1,539 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
+
+; Tests for prolog sequences for stack probing when using a 4KiB stack guard.
+
+; The stack probing parameters in function attributes take precedence over
+; the ones in the module flags.
+
+; Small stack frame, no probing required.
+define void @static_64(ptr %out) #0 {
+; CHECK-LABEL: static_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 64, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; At 256 bytes we start to always create a frame pointer. No frame smaller than
+; this needs a probe, so we can use the saving of at least one CSR as a probe
+; at the top of our frame.
+define void @static_256(ptr %out) #0 {
+; CHECK-LABEL: static_256:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #272
+; CHECK-NEXT: .cfi_def_cfa_offset 272
+; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #272
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 256, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; At 1024 bytes, this is the largest frame which doesn't need probing.
+define void @static_1024(ptr %out) #0 {
+; CHECK-LABEL: static_1024:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 1024, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; At 1024+16 bytes, this is the smallest frame which needs probing.
+define void @static_1040(ptr %out) #0 {
+; CHECK-LABEL: static_1040:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 1040, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k bytes is the largest frame we can probe in one go.
+define void @static_4096(ptr %out) #0 {
+; CHECK-LABEL: static_4096:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 4096, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k+16 bytes, still needs just one probe.
+define void @static_4112(ptr %out) #0 {
+; CHECK-LABEL: static_4112:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp], #-16
+; CHECK-NEXT: .cfi_def_cfa_offset 4128
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 4112, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k+1024 bytes, the largest frame which needs just one probe.
+define void @static_5120(ptr %out) #0 {
+; CHECK-LABEL: static_5120:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 5136
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 5120, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k+1024+16, the smallest frame which needs two probes.
+define void @static_5136(ptr %out) #0 {
+; CHECK-LABEL: static_5136:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 5152
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 5136, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 2*4k+1024, the largest frame needing two probes
+define void @static_9216(ptr %out) #0 {
+; CHECK-LABEL: static_9216:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 8208
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 9232
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 9216, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k-16, the largest frame probed without a loop
+define void @static_20464(ptr %out) #0 {
+; CHECK-LABEL: static_20464:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 4112
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 8208
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 12304
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: .cfi_def_cfa_offset 16400
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #4080
+; CHECK-NEXT: .cfi_def_cfa_offset 20480
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #4, lsl #12 // =16384
+; CHECK-NEXT: .cfi_def_cfa_offset 4096
+; CHECK-NEXT: add sp, sp, #4080
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 20464, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k, the smallest frame probed with a loop
+define void @static_20480(ptr %out) #0 {
+; CHECK-LABEL: static_20480:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa w9, 20496
+; CHECK-NEXT: .LBB10_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB10_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 20480, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k + 1024, large enough to use a loop, but not a multiple of 4KiB
+; so has a remainder, but no extra probe.
+define void @static_21504(ptr %out) #0 {
+; CHECK-LABEL: static_21504:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa w9, 20496
+; CHECK-NEXT: .LBB11_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB11_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 21520
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa_offset 1040
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 21504, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k+1040, large enough to use a loop, has a remainder and
+; an extra probe.
+define void @static_21520(ptr %out) #0 {
+; CHECK-LABEL: static_21520:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa w9, 20496
+; CHECK-NEXT: .LBB12_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.ne .LBB12_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 21536
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 21520, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation, but with a very large alignment requirement. We do this
+; by moving SP far enough that a sufficiently-aligned block will exist
+; somewhere in the stack frame, so we must probe the whole of that larger SP move.
+define void @static_16_align_8192(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_8192:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: sub x9, x9, #4080
+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT: .LBB13_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB13_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB13_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB13_1
+; CHECK-NEXT: .LBB13_3: // %entry
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 16, align 8192
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation with a very large alignment requirement, but
+; nevertheless small enough not to need a loop.
+define void @static_16_align_2048(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_2048:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #2032
+; CHECK-NEXT: and sp, x9, #0xfffffffffffff800
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 16, align 2048
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; A large(-ish) allocation with a very large alignment requirement, but
+; nevertheless small enough not to need a loop.
+define void @static_2032_align_2048(ptr %out) #0 {
+; CHECK-LABEL: static_2032_align_2048:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #2032
+; CHECK-NEXT: and sp, x9, #0xfffffffffffff800
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 2032, align 2048
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; Test that stack probing is enabled by module flags
+define void @static_9232(ptr %out) uwtable(async) {
+; CHECK-LABEL: static_9232:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT: .cfi_def_cfa_offset 8208
+; CHECK-NEXT: sub sp, sp, #800
+; CHECK-NEXT: .cfi_def_cfa_offset 9008
+; CHECK-NEXT: str xzr, [sp], #-240
+; CHECK-NEXT: .cfi_def_cfa_offset 9248
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT: .cfi_def_cfa_offset 1056
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i64 9232, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; Test for a tight upper bound on the amount of stack adjustment
+; due to stack realignment. No probes should appear.
+define void @static_1008(ptr %out) #0 {
+; CHECK-LABEL: static_1008:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x9, sp, #1008
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+  %v = alloca i8, i32 1008, align 32
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 4, !"probe-stack", !"inline-asm"}
+!1 = !{i32 8, !"stack-probe-size", i32 9000}