Skip to content

Commit

Permalink
[AArch64] Stack probing for dynamic allocas in SelectionDAG (#66525)
Browse files Browse the repository at this point in the history
Add support for probing for dynamic allocas (variable-size objects and
outgoing stack arguments).

Co-authored-by: Oliver Stannard <[email protected]>
  • Loading branch information
momchil-velikov and ostannard authored Dec 2, 2023
1 parent a60a542 commit b1806e6
Show file tree
Hide file tree
Showing 6 changed files with 528 additions and 57 deletions.
28 changes: 26 additions & 2 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
/// included as part of the stack frame.
bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
// The stack probing code for the dynamically allocated outgoing arguments
// area assumes that the stack is probed at the top - either by the prologue
// code, which issues a probe if `hasVarSizedObjects` return true, or by the
// most recent variable-sized object allocation. Changing the condition here
// may need to be followed up by changes to the probe issuing logic.
return !MF.getFrameInfo().hasVarSizedObjects();
}

Expand All @@ -487,6 +492,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineBasicBlock::iterator I) const {
const AArch64InstrInfo *TII =
static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
const AArch64TargetLowering *TLI =
MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
MachineFrameInfo &MFI = MF.getFrameInfo();
DebugLoc DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
Expand All @@ -513,8 +521,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
// Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(Amount), TII);

if (TLI->hasInlineStackProbe(MF) &&
-Amount >= AArch64::StackProbeMaxUnprobedStack) {
// When stack probing is enabled, the decrement of SP may need to be
// probed. We only need to do this if the call site needs 1024 bytes of
// space or more, because a region smaller than that is allowed to be
// unprobed at an ABI boundary. We rely on the fact that SP has been
// probed exactly at this point, either by the prologue or most recent
// dynamic allocation.
assert(MFI.hasVarSizedObjects() &&
"non-reserved call frame without var sized objects?");
Register ScratchReg =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0));
} else {
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(Amount), TII);
}
}
} else if (CalleePopAmount != 0) {
// If the calling convention demands that the callee pops arguments from the
Expand Down
154 changes: 102 additions & 52 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -569,10 +569,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSHL, MVT::i32, Custom);
setOperationAction(ISD::FSHL, MVT::i64, Custom);

if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
Expand Down Expand Up @@ -2353,6 +2350,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::CSINC)
MAKE_CASE(AArch64ISD::THREAD_POINTER)
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
MAKE_CASE(AArch64ISD::ABDS_PRED)
MAKE_CASE(AArch64ISD::ABDU_PRED)
MAKE_CASE(AArch64ISD::HADDS_PRED)
Expand Down Expand Up @@ -2719,6 +2717,22 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
return BB;
}

MachineBasicBlock *
AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
MachineBasicBlock *MBB) const {
MachineFunction &MF = *MBB->getParent();
MachineBasicBlock::iterator MBBI = MI.getIterator();
DebugLoc DL = MBB->findDebugLoc(MBBI);
const AArch64InstrInfo &TII =
*MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
Register TargetReg = MI.getOperand(0).getReg();
MachineBasicBlock::iterator NextInst =
TII.probedStackAlloc(MBBI, TargetReg, false);

MI.eraseFromParent();
return NextInst->getParent();
}

MachineBasicBlock *
AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
Expand Down Expand Up @@ -2863,6 +2877,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(

case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);

case AArch64::PROBED_STACKALLOC_DYN:
return EmitDynamicProbedAlloc(MI, BB);

case AArch64::LD1_MXIPXX_H_PSEUDO_B:
return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
case AArch64::LD1_MXIPXX_H_PSEUDO_H:
Expand Down Expand Up @@ -14052,9 +14070,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
AN->getMemOperand());
}

SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
SDValue
AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {

SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
MaybeAlign Align =
cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
EVT VT = Node->getValueType(0);

if (DAG.getMachineFunction().getFunction().hasFnAttribute(
"no-stack-arg-probe")) {
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}

Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
PtrVT, 0);
Expand All @@ -14078,7 +14121,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(

Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
DAG.getConstant(4, dl, MVT::i64));
return Chain;

SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);

Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}

SDValue
AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);

MaybeAlign Align =
cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
SDLoc dl(Op);
EVT VT = Node->getValueType(0);

// Construct the new SP value in a GPR.
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));

// Set the real SP to the new value with a probing loop.
Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}

SDValue
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();

if (Subtarget->isTargetWindows())
return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
else if (hasInlineStackProbe(MF))
return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
else
return SDValue();
}

// When x and y are extended, lower:
Expand Down Expand Up @@ -14132,51 +14227,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
}

SDValue
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() &&
"Only Windows alloca probing supported");
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
MaybeAlign Align =
cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
EVT VT = Node->getValueType(0);

if (DAG.getMachineFunction().getFunction().hasFnAttribute(
"no-stack-arg-probe")) {
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}

Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);

SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);

Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}

SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
Expand Down
13 changes: 10 additions & 3 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ enum NodeType : unsigned {
ADC,
SBC, // adc, sbc instructions

// To avoid stack clash, allocation is performed by block and each block is
// probed.
PROBED_ALLOCA,

// Predicated instructions where inactive lanes produce undefined results.
ABDS_PRED,
ABDU_PRED,
Expand Down Expand Up @@ -616,6 +620,9 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;

MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
MachineBasicBlock *MBB) const;

MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
MachineBasicBlock *BB) const;
Expand Down Expand Up @@ -1141,10 +1148,10 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
SDValue &Size,
SelectionDAG &DAG) const;

SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;

SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain,
def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;

def AArch64probedalloca
: SDNode<"AArch64ISD::PROBED_ALLOCA",
SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPMayStore]>;

def AArch64mrs : SDNode<"AArch64ISD::MRS",
SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
[SDNPHasChain, SDNPOutGlue]>;
Expand Down Expand Up @@ -963,6 +969,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs),
[]>,
Sched<[]>;

// Probed stack allocations of a variable size, used for allocas of unknown size
// when stack-clash protection is enabled.
let usesCustomInserter = 1 in
def PROBED_STACKALLOC_DYN : Pseudo<(outs),
(ins GPR64common:$target),
[(AArch64probedalloca GPR64common:$target)]>,
Sched<[]>;

} // Defs = [SP, NZCV], Uses = [SP] in
} // hasSideEffects = 1, isCodeGenOnly = 1

Expand Down
14 changes: 14 additions & 0 deletions llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
; RUN: llc --stop-after=finalize-isel -o - | FileCheck %s
target triple = "aarch64-linux"

; Check dynamic stack allocation and probing instructions do not have
; the FrameSetup flag.

; CHECK-NOT: frame-setup
define void @no_frame_setup(i64 %size, ptr %out) #0 {
%v = alloca i8, i64 %size, align 1
store ptr %v, ptr %out, align 8
ret void
}

attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
Loading

0 comments on commit b1806e6

Please sign in to comment.