forked from OSchip/llvm-project
[X86] Emit fewer instructions to allocate >16GB stack frames
Summary:
Use this code pattern when RAX is live, instead of emitting up to 2 billion adjustments:

    pushq %rax
    movabsq +-$Offset+-8, %rax
    addq %rsp, %rax
    xchg %rax, (%rsp)
    movq (%rsp), %rsp

Try to clean this code up a bit while I'm here. In particular, hoist the logic that handles the entire adjustment with `movabsq $imm, %rax` out of the loop.

This negates the offset in the prologue and uses ADD because X86 only has a two-operand subtract which always subtracts from the destination register, which can no longer be RSP.

Fixes PR31962

Reviewers: majnemer, sdardis
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D30052
llvm-svn: 298116
This commit is contained in:
parent
661ffe104e
commit
edf1cbb580
|
@ -252,40 +252,76 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
|
|||
int64_t NumBytes, bool InEpilogue) const {
|
||||
bool isSub = NumBytes < 0;
|
||||
uint64_t Offset = isSub ? -NumBytes : NumBytes;
|
||||
MachineInstr::MIFlag Flag =
|
||||
isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
|
||||
|
||||
uint64_t Chunk = (1LL << 31) - 1;
|
||||
DebugLoc DL = MBB.findDebugLoc(MBBI);
|
||||
|
||||
while (Offset) {
|
||||
if (Offset > Chunk) {
|
||||
// Rather than emit a long series of instructions for large offsets,
|
||||
// load the offset into a register and do one sub/add
|
||||
unsigned Reg = 0;
|
||||
if (Offset > Chunk) {
|
||||
// Rather than emit a long series of instructions for large offsets,
|
||||
// load the offset into a register and do one sub/add
|
||||
unsigned Reg = 0;
|
||||
unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
|
||||
|
||||
if (isSub && !isEAXLiveIn(MBB))
|
||||
Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
|
||||
if (isSub && !isEAXLiveIn(MBB))
|
||||
Reg = Rax;
|
||||
else
|
||||
Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
|
||||
|
||||
unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
|
||||
unsigned AddSubRROpc =
|
||||
isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
|
||||
if (Reg) {
|
||||
BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
|
||||
.addImm(Offset)
|
||||
.setMIFlag(Flag);
|
||||
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
|
||||
.addReg(StackPtr)
|
||||
.addReg(Reg);
|
||||
MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
|
||||
return;
|
||||
} else if (Offset > 8 * Chunk) {
|
||||
// If we would need more than 8 add or sub instructions (a >16GB stack
|
||||
// frame), it's worth spilling RAX to materialize this immediate.
|
||||
// pushq %rax
|
||||
// movabsq +-$Offset+-SlotSize, %rax
|
||||
// addq %rsp, %rax
|
||||
// xchg %rax, (%rsp)
|
||||
// movq (%rsp), %rsp
|
||||
assert(Is64Bit && "can't have 32-bit 16GB stack frame");
|
||||
BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
|
||||
.addReg(Rax, RegState::Kill)
|
||||
.setMIFlag(Flag);
|
||||
// Subtract is not commutative, so negate the offset and always use add.
|
||||
// Subtract 8 less and add 8 more to account for the PUSH we just did.
|
||||
if (isSub)
|
||||
Offset = -(Offset - SlotSize);
|
||||
else
|
||||
Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
|
||||
|
||||
if (Reg) {
|
||||
unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
|
||||
BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
|
||||
.addImm(Offset);
|
||||
Opc = isSub
|
||||
? getSUBrrOpcode(Is64Bit)
|
||||
: getADDrrOpcode(Is64Bit);
|
||||
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
|
||||
.addReg(StackPtr)
|
||||
.addReg(Reg);
|
||||
MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
|
||||
Offset = 0;
|
||||
continue;
|
||||
}
|
||||
Offset = Offset + SlotSize;
|
||||
BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
|
||||
.addImm(Offset)
|
||||
.setMIFlag(Flag);
|
||||
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
|
||||
.addReg(Rax)
|
||||
.addReg(StackPtr);
|
||||
MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
|
||||
// Exchange the new SP in RAX with the top of the stack.
|
||||
addRegOffset(
|
||||
BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
|
||||
StackPtr, false, 0);
|
||||
// Load new SP from the top of the stack into RSP.
|
||||
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
|
||||
StackPtr, false, 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
while (Offset) {
|
||||
uint64_t ThisVal = std::min(Offset, Chunk);
|
||||
if (ThisVal == (Is64Bit ? 8 : 4)) {
|
||||
// Use push / pop instead.
|
||||
if (ThisVal == SlotSize) {
|
||||
// Use push / pop for slot sized adjustments as a size optimization. We
|
||||
// need to find a dead register when using pop.
|
||||
unsigned Reg = isSub
|
||||
? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
|
||||
: findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
|
||||
|
@ -293,23 +329,16 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
|
|||
unsigned Opc = isSub
|
||||
? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
|
||||
: (Is64Bit ? X86::POP64r : X86::POP32r);
|
||||
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
|
||||
.addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
|
||||
if (isSub)
|
||||
MI->setFlag(MachineInstr::FrameSetup);
|
||||
else
|
||||
MI->setFlag(MachineInstr::FrameDestroy);
|
||||
BuildMI(MBB, MBBI, DL, TII.get(Opc))
|
||||
.addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
|
||||
.setMIFlag(Flag);
|
||||
Offset -= ThisVal;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
MachineInstrBuilder MI = BuildStackAdjustment(
|
||||
MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
|
||||
if (isSub)
|
||||
MI.setMIFlag(MachineInstr::FrameSetup);
|
||||
else
|
||||
MI.setMIFlag(MachineInstr::FrameDestroy);
|
||||
BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
|
||||
.setMIFlag(Flag);
|
||||
|
||||
Offset -= ThisVal;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=CHECK
|
||||
|
||||
; Test how we handle pathologically large stack frames when RAX is live through
|
||||
; the prologue and epilogue.
|
||||
|
||||
declare void @bar(i8*)
|
||||
declare void @llvm.va_start(i8*)
|
||||
|
||||
; For stack frames between 2GB and 16GB, do multiple adjustments.
|
||||
|
||||
define i32 @stack_frame_8gb(i32 %x, ...) nounwind {
|
||||
; CHECK-LABEL: stack_frame_8gb:
|
||||
; CHECK: subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
|
||||
; CHECK: subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
|
||||
; CHECK: subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
|
||||
; CHECK: subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
|
||||
; CHECK: subq ${{.*}}, %rsp
|
||||
; CHECK: callq bar
|
||||
; CHECK: addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
|
||||
; CHECK: addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
|
||||
; CHECK: addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
|
||||
; CHECK: addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
|
||||
; CHECK: addq ${{.*}}, %rsp
|
||||
; CHECK: retq
|
||||
%1 = alloca [u0x200000000 x i8]
|
||||
%va = alloca i8, i32 24
|
||||
call void @llvm.va_start(i8* %va)
|
||||
%2 = getelementptr inbounds [u0x200000000 x i8], [u0x200000000 x i8]* %1, i32 0, i32 0
|
||||
call void @bar(i8* %2)
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
; For stack frames larger than 16GB, spill RAX instead of doing a linear number
|
||||
; of adjustments.
|
||||
|
||||
; This function should have a frame size of 0x4000000D0. The 0xD0 is 208 bytes
|
||||
; from 24 bytes of va_list, 176 bytes of spilled varargs regparms, and 8 bytes
|
||||
; of alignment. We subtract 8 less and add 8 more in the prologue and epilogue
|
||||
; respectively to account for the PUSH.
|
||||
|
||||
define i32 @stack_frame_16gb(i32 %x, ...) nounwind {
|
||||
; CHECK-LABEL: stack_frame_16gb:
|
||||
; CHECK: pushq %rax
|
||||
; CHECK-NEXT: movabsq ${{.*}}, %rax # imm = 0xFFFFFFFBFFFFFF38
|
||||
; CHECK-NEXT: addq %rsp, %rax
|
||||
; CHECK-NEXT: xchgq %rax, (%rsp)
|
||||
; CHECK-NEXT: movq (%rsp), %rsp
|
||||
; CHECK: callq bar
|
||||
; CHECK: pushq %rax
|
||||
; CHECK-NEXT: movabsq ${{.*}}, %rax # imm = 0x4000000D8
|
||||
; CHECK-NEXT: addq %rsp, %rax
|
||||
; CHECK-NEXT: xchgq %rax, (%rsp)
|
||||
; CHECK-NEXT: movq (%rsp), %rsp
|
||||
; CHECK: retq
|
||||
%1 = alloca [u0x400000000 x i8]
|
||||
%va = alloca i8, i32 24
|
||||
call void @llvm.va_start(i8* %va)
|
||||
%2 = getelementptr inbounds [u0x400000000 x i8], [u0x400000000 x i8]* %1, i32 0, i32 0
|
||||
call void @bar(i8* %2)
|
||||
ret i32 %x
|
||||
}
|
||||
|
Loading…
Reference in New Issue