From b1806e6a1f0589acc88499419531c4eb82488f1a Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Sat, 2 Dec 2023 10:09:41 +0000 Subject: [PATCH] [AArch64] Stack probing for dynamic allocas in SelectionDAG (#66525) Add support for probing for dynamic allocas (variable-size objects and outgoing stack arguments). Co-authored-by: Oliver Stannard --- .../Target/AArch64/AArch64FrameLowering.cpp | 28 +- .../Target/AArch64/AArch64ISelLowering.cpp | 154 +++++--- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 13 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14 + .../stack-probing-dynamic-no-frame-setup.ll | 14 + .../CodeGen/AArch64/stack-probing-dynamic.ll | 362 ++++++++++++++++++ 6 files changed, 528 insertions(+), 57 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 96702cb8b255af..e4451a9a85f50f 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -479,6 +479,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { /// included as part of the stack frame. bool AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + // The stack probing code for the dynamically allocated outgoing arguments + // area assumes that the stack is probed at the top - either by the prologue + // code, which issues a probe if `hasVarSizedObjects` return true, or by the + // most recent variable-sized object allocation. Changing the condition here + // may need to be followed up by changes to the probe issuing logic. return !MF.getFrameInfo().hasVarSizedObjects(); } @@ -487,6 +492,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineBasicBlock::iterator I) const { const AArch64InstrInfo *TII = static_cast(MF.getSubtarget().getInstrInfo()); + const AArch64TargetLowering *TLI = + MF.getSubtarget().getTargetLowering(); + MachineFrameInfo &MFI = MF.getFrameInfo(); DebugLoc DL = I->getDebugLoc(); unsigned Opc = I->getOpcode(); bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); @@ -513,8 +521,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(Amount), TII); + + if (TLI->hasInlineStackProbe(MF) && + -Amount >= AArch64::StackProbeMaxUnprobedStack) { + // When stack probing is enabled, the decrement of SP may need to be + // probed. We only need to do this if the call site needs 1024 bytes of + // space or more, because a region smaller than that is allowed to be + // unprobed at an ABI boundary. We rely on the fact that SP has been + // probed exactly at this point, either by the prologue or most recent + // dynamic allocation. 
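      // As a concrete instance (grounded in the no_reserved_call_frame test
      // added below): a call site passing 138 i64 values on the stack needs a
      // 1104-byte decrement; since 1104 >= 1024 (StackProbeMaxUnprobedStack),
      // that decrement is emitted through inlineStackProbeFixed and gets a
      // probe, while a decrement under 1024 bytes keeps using the plain
      // emitFrameOffset path below.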
+ assert(MFI.hasVarSizedObjects() && + "non-reserved call frame without var sized objects?"); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0)); + } else { + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(Amount), TII); + } } } else if (CalleePopAmount != 0) { // If the calling convention demands that the callee pops arguments from the diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 011aedeba1eb79..b6a16217dfae39 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -569,10 +569,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSHL, MVT::i32, Custom); setOperationAction(ISD::FSHL, MVT::i64, Custom); - if (Subtarget->isTargetWindows()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); - else - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); @@ -2353,6 +2350,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::CSINC) MAKE_CASE(AArch64ISD::THREAD_POINTER) MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) + MAKE_CASE(AArch64ISD::PROBED_ALLOCA) MAKE_CASE(AArch64ISD::ABDS_PRED) MAKE_CASE(AArch64ISD::ABDU_PRED) MAKE_CASE(AArch64ISD::HADDS_PRED) @@ -2719,6 +2717,22 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction &MF = *MBB->getParent(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + DebugLoc DL = MBB->findDebugLoc(MBBI); + const AArch64InstrInfo &TII = + *MF.getSubtarget().getInstrInfo(); + Register TargetReg = MI.getOperand(0).getReg(); + MachineBasicBlock::iterator NextInst = + TII.probedStackAlloc(MBBI, TargetReg, false); + + MI.eraseFromParent(); + return NextInst->getParent(); +} + MachineBasicBlock * AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, @@ -2863,6 +2877,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case AArch64::CATCHRET: return EmitLoweredCatchRet(MI, BB); + + case AArch64::PROBED_STACKALLOC_DYN: + return EmitDynamicProbedAlloc(MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_B: return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_H: @@ -14052,9 +14070,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, AN->getMemOperand()); } -SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( - SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { +SDValue +AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + // Get the inputs. 
+ SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + MaybeAlign Align = + cast(Op.getOperand(2))->getMaybeAlignValue(); + EVT VT = Node->getValueType(0); + + if (DAG.getMachineFunction().getFunction().hasFnAttribute( + "no-stack-arg-probe")) { + SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); + if (Align) + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); + SDValue Ops[2] = {SP, Chain}; + return DAG.getMergeValues(Ops, dl); + } + + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(), PtrVT, 0); @@ -14078,7 +14121,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, DAG.getConstant(4, dl, MVT::i64)); - return Chain; + + SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); + if (Align) + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); + + Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); + + SDValue Ops[2] = {SP, Chain}; + return DAG.getMergeValues(Ops, dl); +} + +SDValue +AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + + MaybeAlign Align = + cast(Op.getOperand(2))->getMaybeAlignValue(); + SDLoc dl(Op); + EVT VT = Node->getValueType(0); + + // Construct the new SP value in a GPR. + SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); + if (Align) + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + + // Set the real SP to the new value with a probing loop. + Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); + SDValue Ops[2] = {SP, Chain}; + return DAG.getMergeValues(Ops, dl); +} + +SDValue +AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + + if (Subtarget->isTargetWindows()) + return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG); + else if (hasInlineStackProbe(MF)) + return LowerInlineDYNAMIC_STACKALLOC(Op, DAG); + else + return SDValue(); } // When x and y are extended, lower: @@ -14132,51 +14227,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, dl, VT, Add, tmp); } -SDValue -AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetWindows() && - "Only Windows alloca probing supported"); - SDLoc dl(Op); - // Get the inputs. 
- SDNode *Node = Op.getNode(); - SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - MaybeAlign Align = - cast(Op.getOperand(2))->getMaybeAlignValue(); - EVT VT = Node->getValueType(0); - - if (DAG.getMachineFunction().getFunction().hasFnAttribute( - "no-stack-arg-probe")) { - SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); - Chain = SP.getValue(1); - SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); - if (Align) - SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); - SDValue Ops[2] = {SP, Chain}; - return DAG.getMergeValues(Ops, dl); - } - - Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); - - Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); - - SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); - Chain = SP.getValue(1); - SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); - if (Align) - SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); - - Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); - - SDValue Ops[2] = {SP, Chain}; - return DAG.getMergeValues(Ops, dl); -} - SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index e6d62f1704726b..3c8479e1f6e3c3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -83,6 +83,10 @@ enum NodeType : unsigned { ADC, SBC, // adc, sbc instructions + // To avoid stack clash, allocation is performed by block and each block is + // probed. + PROBED_ALLOCA, + // Predicated instructions where inactive lanes produce undefined results. 
ABDS_PRED, ABDU_PRED, @@ -616,6 +620,9 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI, + MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const; @@ -1141,10 +1148,10 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, - SDValue &Size, - SelectionDAG &DAG) const; + SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const; SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 935667488194a3..44b0337fe78791 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -863,6 +863,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain, def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; + +def AArch64probedalloca + : SDNode<"AArch64ISD::PROBED_ALLOCA", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPMayStore]>; + def AArch64mrs : SDNode<"AArch64ISD::MRS", SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>, [SDNPHasChain, SDNPOutGlue]>; @@ -963,6 +969,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs), []>, Sched<[]>; +// Probed stack allocations of a variable size, used for allocas of unknown size +// when stack-clash protection is enabled. +let usesCustomInserter = 1 in +def PROBED_STACKALLOC_DYN : Pseudo<(outs), + (ins GPR64common:$target), + [(AArch64probedalloca GPR64common:$target)]>, + Sched<[]>; + } // Defs = [SP, NZCV], Uses = [SP] in } // hasSideEffects = 1, isCodeGenOnly = 1 diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll new file mode 100644 index 00000000000000..96f2f63d703c7d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll @@ -0,0 +1,14 @@ +; RUN: llc --stop-after=finalize-isel -o - | FileCheck %s +target triple = "aarch64-linux" + +; Check dynamic stack allocation and probing instructions do not have +; the FrameSetup flag. 
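; (The FrameSetup flag marks instructions belonging to the function's
; frame-setup code; the probing sequence for a dynamic allocation is emitted
; in the function body, so none of its instructions should carry the flag.)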
+ +; CHECK-NOT: frame-setup +define void @no_frame_setup(i64 %size, ptr %out) #0 { + %v = alloca i8, i64 %size, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll new file mode 100644 index 00000000000000..78b83099c7ed9f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll @@ -0,0 +1,362 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s + +; Dynamically-sized allocation, needs a loop which can handle any size at +; runtime. The final iteration of the loop will temporarily put SP below the +; target address, but this doesn't break any of the ABI constraints on the +; stack, and also doesn't probe below the target SP value. +define void @dynamic(i64 %size, ptr %out) #0 { +; CHECK-LABEL: dynamic: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB0_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v = alloca i8, i64 %size, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; This function has a fixed-size stack slot and a dynamic one. The fixed size +; slot isn't large enough that we would normally probe it, but we need to do so +; here otherwise the gap between the CSR save and the first probe of the +; dynamic allocation could be too far apart when the size of the dynamic +; allocation is close to the guard size. +define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 { +; CHECK-LABEL: dynamic_fixed: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: str xzr, [sp, #-64]! 
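; The pre-indexed store above both allocates the 64-byte fixed-size slot and
; probes it, keeping the gap between this probe and the first probe of the
; dynamic allocation within the stack guard size, as described in the comment
; preceding this function.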
+; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: sub x10, x29, #64 +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: str x10, [x1] +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB1_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x2] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v1 = alloca i8, i64 64, align 1 + store ptr %v1, ptr %out1, align 8 + %v2 = alloca i8, i64 %size, align 1 + store ptr %v2, ptr %out2, align 8 + ret void +} + +; Dynamic allocation, with an alignment requirement greater than the alignment +; of SP. Done by ANDing the target SP with a constant to align it down, then +; doing the loop as normal. Note that we also re-align the stack in the prolog, +; which isn't actually needed because the only aligned allocations are dynamic, +; this is done even without stack probing. +define void @dynamic_align_64(i64 %size, ptr %out) #0 { +; CHECK-LABEL: dynamic_align_64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub x9, sp, #32 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffc0 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0 +; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB2_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB2_1 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v = alloca i8, i64 %size, align 64 + store ptr %v, ptr %out, align 8 + ret void +} + +; Dynamic allocation, with an alignment greater than the stack guard size. The +; only difference to the dynamic allocation is the constant used for aligning +; the target SP, the loop will probe the whole allocation without needing to +; know about the alignment padding. +define void @dynamic_align_8192(i64 %size, ptr %out) #0 { +; CHECK-LABEL: dynamic_align_8192: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4064 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: .LBB3_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB3_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB3_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB3_1 +; CHECK-NEXT: .LBB3_3: +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: and x8, x8, #0xffffffffffffe000 +; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB3_6 +; CHECK-NEXT: // %bb.5: // in Loop: Header=BB3_4 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB3_4 +; CHECK-NEXT: .LBB3_6: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v = alloca i8, i64 %size, align 8192 + store ptr %v, ptr %out, align 8 + ret void +} + +; For 64k guard pages, the only difference is the constant subtracted from SP +; in the loop. +define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" { +; CHECK-LABEL: dynamic_64k_guard: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: .LBB4_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB4_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB4_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB4_1 +; CHECK-NEXT: .LBB4_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v = alloca i8, i64 %size, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; If a function has variable-sized stack objects, then any function calls which +; need to pass arguments on the stack must allocate the stack space for them +; dynamically, to ensure they are at the bottom of the frame. We need to probe +; that space when it is larger than the unprobed space allowed by the ABI (1024 +; bytes), so this needs a very large number of arguments. 
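; Here callee_stack_args takes 138 i64 values on the stack, i.e. 138 * 8 =
; 1104 bytes of outgoing arguments, which exceeds the 1024-byte unprobed
; limit, so the 'sub sp, sp, #1104' below is followed by a probe
; (str xzr, [sp]) before the call.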
+define void @no_reserved_call_frame(i64 %n) #0 { +; CHECK-LABEL: no_reserved_call_frame: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: lsl x9, x0, #2 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x0, x8, x9 +; CHECK-NEXT: .LBB5_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x0 +; CHECK-NEXT: b.le .LBB5_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB5_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB5_1 +; CHECK-NEXT: .LBB5_3: // %entry +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1104 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: bl callee_stack_args +; CHECK-NEXT: add sp, sp, #1104 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i32, i64 %n + call void @callee_stack_args(ptr %v, [138 x i64] undef) + ret void +} + +; Same as above but without a variable-sized allocation, so the reserved call +; frame can be folded into the fixed-size allocation in the prologue. +define void @reserved_call_frame(i64 %n) #0 { +; CHECK-LABEL: reserved_call_frame: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w28, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #1504 +; CHECK-NEXT: add x0, sp, #1104 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: bl callee_stack_args +; CHECK-NEXT: add sp, sp, #1504 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i32, i64 100 + call void @callee_stack_args(ptr %v, [138 x i64] undef) + ret void +} + +declare void @callee_stack_args(ptr, [138 x i64]) + +; Dynamic allocation of SVE vectors +define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" { +; CHECK-LABEL: dynamic_sve: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: rdvl x9, #1
+; CHECK-NEXT: mov x10, #15 // =0xf
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: madd x9, x0, x9, x10
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x8
+; CHECK-NEXT: b.le .LBB7_3
+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB7_1
+; CHECK-NEXT: .LBB7_3:
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 32
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+  %v = alloca <vscale x 16 x i8>, i64 %size, align 16
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
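
The tests above drive the new lowering purely through function attributes. As a minimal standalone sketch (modelled on the @dynamic test; the function name @probe_me is arbitrary), a variable-sized alloca only needs the "probe-stack"="inline-asm" attribute to get the inline probing loop, and "stack-probe-size" can be added to override the default 4096-byte probe size, as @dynamic_64k_guard does:

  define void @probe_me(i64 %n, ptr %out) "probe-stack"="inline-asm" {
    %p = alloca i8, i64 %n
    store ptr %p, ptr %out, align 8
    ret void
  }

Compiling this with llc -mtriple aarch64-none-eabi, as in the RUN line above, should produce the same probe loop (sub sp, sp, #1, lsl #12, a cmp against the target SP, and a str xzr, [sp] probe) that the CHECK lines verify.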