Skip to content

Commit

Permalink
Fix for a missing probe to the SVE callee-saved registers area
Browse files Browse the repository at this point in the history
In some cases a probe may be elided, resulting in more than 1024
unprobed area at the top of the stack or decrement of the SP by more
than a guard area size.
  • Loading branch information
momchil-velikov committed Nov 30, 2023
1 parent 2278dd2 commit 94f8691
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 10 deletions.
20 changes: 12 additions & 8 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,8 @@ static int64_t upperBound(StackOffset Size) {
void AArch64FrameLowering::allocateStackSpace(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI,
bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const {
bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset,
bool FollowupAllocs) const {

if (!AllocSize)
return;
Expand Down Expand Up @@ -753,9 +754,10 @@ void AArch64FrameLowering::allocateStackSpace(
.addImm(InitialOffset.getFixed())
.addImm(InitialOffset.getScalable());
// The fixed allocation may leave unprobed bytes at the top of the
// stack. If we have variable-sized objects, we need to issue an extra
// probe, so their allocations starts in a known state.
if (MFI.hasVarSizedObjects()) {
// stack. If we have subsequent alocation (e.g. if we have variable-sized
// objects), we need to issue an extra probe, so these allocations start in
// a known state.
if (FollowupAllocs) {
// STR XZR, [SP]
BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
.addReg(AArch64::XZR)
Expand Down Expand Up @@ -789,8 +791,8 @@ void AArch64FrameLowering::allocateStackSpace(
.setMIFlags(MachineInstr::FrameSetup);
AFI.setStackRealigned(true);
}
if (MFI.hasVarSizedObjects() || upperBound(AllocSize) + RealignmentPadding >
AArch64::StackProbeMaxUnprobedStack) {
if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding >
AArch64::StackProbeMaxUnprobedStack) {
// STR XZR, [SP]
BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
.addReg(AArch64::XZR)
Expand Down Expand Up @@ -1986,8 +1988,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Allocate space for the callee saves (if any).
StackOffset CFAOffset =
StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false,
nullptr, EmitAsyncCFI && !HasFP, CFAOffset);
nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
MFI.hasVarSizedObjects() || LocalsSize);
CFAOffset += SVECalleeSavesSize;

if (EmitAsyncCFI)
Expand All @@ -2004,7 +2008,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
SVELocalsSize + StackOffset::getFixed(NumBytes),
NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
CFAOffset);
CFAOffset, MFI.hasVarSizedObjects());
}

// If we need a base pointer, set it up here. It's whatever the value of the
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ class AArch64FrameLowering : public TargetFrameLowering {
void allocateStackSpace(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
int64_t RealignmentPadding, StackOffset AllocSize,
bool NeedsWinCFI, bool *HasWinCFI,
bool EmitCFI, StackOffset InitialOffset) const;
bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
StackOffset InitialOffset, bool FollowupAllocs) const;

/// Emit target zero call-used regs.
void emitZeroCallUsedRegs(BitVector RegsToZero,
Expand Down
48 changes: 48 additions & 0 deletions llvm/test/CodeGen/AArch64/stack-probing-sve.ll
Original file line number Diff line number Diff line change
Expand Up @@ -671,4 +671,52 @@ entry:
ret void
}

; Test with a 14 scalable bytes (so up to 14 * 16 = 224) of unprobed
; are bellow the save location of `p9`.
define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
; CHECK-LABEL: sve_unprobed_area:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
; CHECK-NEXT: .cfi_restore z9
; CHECK-NEXT: .cfi_restore z10
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" ()

%v0 = alloca <vscale x 4 x float>, align 16
%v1 = alloca <vscale x 4 x float>, align 16
%v2 = alloca <vscale x 4 x float>, align 16
%v3 = alloca <vscale x 4 x float>, align 16

ret void
}

attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }

0 comments on commit 94f8691

Please sign in to comment.