From 03828e38c3cf6f1b8f548b724bcf46e0cffeeac8 Mon Sep 17 00:00:00 2001 From: Kai Luo Date: Fri, 3 Jul 2020 05:27:25 +0000 Subject: [PATCH] [PowerPC] Implement probing for dynamic stack allocation This patch is part of supporting `-fstack-clash-protection`. Compared to the existing `lowerDynamicAlloc`, it mainly does the following: - Add new pseudo instructions PPC::PREPARE_PROBED_ALLOCA_32/64 to get the actual frame pointer and final stack pointer. - Synthesize a loop to probe by blocks. - Use DYNAREAOFFSET to get MaxCallFrameSize, which is calculated in prologepilog. Differential Revision: https://reviews.llvm.org/D81358 --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 187 +++++++- llvm/lib/Target/PowerPC/PPCISelLowering.h | 11 + llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 10 + llvm/lib/Target/PowerPC/PPCInstrInfo.td | 11 + llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 39 ++ llvm/lib/Target/PowerPC/PPCRegisterInfo.h | 1 + .../PowerPC/stack-clash-dynamic-alloca.ll | 437 ++++++++++++++++++ 7 files changed, 695 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 6083a0d26dd8..a5b9c9b60e6b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -126,6 +126,7 @@ cl::desc("use absolute jump tables on ppc"), cl::Hidden); STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM"); +STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed"); static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); @@ -1486,6 +1487,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; case 
PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; + case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; @@ -7919,6 +7921,7 @@ PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); @@ -7931,9 +7934,10 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, DAG.getConstant(0, dl, PtrVT), Size); // Construct a node for the frame pointer save index. SDValue FPSIdx = getFramePointerFrameIndex(DAG); - // Build a DYNALLOC node. SDValue Ops[3] = { Chain, NegSize, FPSIdx }; SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); + if (hasInlineStackProbe(MF)) + return DAG.getNode(PPCISD::PROBED_ALLOCA, dl, VTs, Ops); return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); } @@ -11799,6 +11803,184 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, return MBB; } +bool PPCTargetLowering::hasInlineStackProbe(MachineFunction &MF) const { + // If the function specifically requests inline stack probes, emit them. + if (MF.getFunction().hasFnAttribute("probe-stack")) + return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == + "inline-asm"; + return false; +} + +unsigned PPCTargetLowering::getStackProbeSize(MachineFunction &MF) const { + const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + unsigned StackAlign = TFI->getStackAlignment(); + assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) && + "Unexpected stack alignment"); + // The default stack probe size is 4096 if the function has no + // stack-probe-size attribute. 
+ unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) + Fn.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + // Round down to the stack alignment. + StackProbeSize &= ~(StackAlign - 1); + return StackProbeSize ? StackProbeSize : StackAlign; +} + +// Lower dynamic stack allocation with probing. `emitProbedAlloca` is split +// into three phases. In the first phase, it uses pseudo instruction +// PREPARE_PROBED_ALLOCA to get the future result of actual FramePointer and +// FinalStackPtr. In the second phase, it generates a loop for probing blocks. +// Finally, it uses pseudo instruction DYNAREAOFFSET to get the future result of +// MaxCallFrameSize so that it can calculate the correct data area pointer. +MachineBasicBlock * +PPCTargetLowering::emitProbedAlloca(MachineInstr &MI, + MachineBasicBlock *MBB) const { + const bool isPPC64 = Subtarget.isPPC64(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + const unsigned ProbeSize = getStackProbeSize(*MF); + const BasicBlock *ProbedBB = MBB->getBasicBlock(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + // The CFG of probing stack looks as + // +-----+ + // | MBB | + // +--+--+ + // | + // +----v----+ + // +--->+ TestMBB +---+ + // | +----+----+ | + // | | | + // | +-----v----+ | + // +---+ BlockMBB | | + // +----------+ | + // | + // +---------+ | + // | TailMBB +<--+ + // +---------+ + // In MBB, calculate the previous frame pointer and the final stack pointer. + // In TestMBB, test if sp is equal to the final stack pointer; if so, jump to + // TailMBB. In BlockMBB, update the sp atomically and jump back to TestMBB. + // TailMBB is spliced via \p MI. 
+ MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(ProbedBB); + MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(ProbedBB); + MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(ProbedBB); + + MachineFunction::iterator MBBIter = ++MBB->getIterator(); + MF->insert(MBBIter, TestMBB); + MF->insert(MBBIter, BlockMBB); + MF->insert(MBBIter, TailMBB); + + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + Register DstReg = MI.getOperand(0).getReg(); + Register NegSizeReg = MI.getOperand(1).getReg(); + Register SPReg = isPPC64 ? PPC::X1 : PPC::R1; + Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + + // Get the canonical FinalStackPtr like what + // PPCRegisterInfo::lowerDynamicAlloc does. + BuildMI(*MBB, {MI}, DL, + TII->get(isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 + : PPC::PREPARE_PROBED_ALLOCA_32), + FramePointer) + .addDef(FinalStackPtr) + .addReg(NegSizeReg) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + + // Materialize a scratch register for update. + int64_t NegProbeSize = -(int64_t)ProbeSize; + assert(isInt<32>(NegProbeSize) && "Unhandled probe size!"); + Register ScratchReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + if (!isInt<16>(NegProbeSize)) { + Register TempReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg) + .addImm(NegProbeSize >> 16); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI), + ScratchReg) + .addReg(TempReg) + .addImm(NegProbeSize & 0xFFFF); + } else + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LI8 : PPC::LI), ScratchReg) + .addImm(NegProbeSize); + + { + // Probing leading residual part. + Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? 
PPC::DIVD : PPC::DIVW), Div) + .addReg(NegSizeReg) + .addReg(ScratchReg); + Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul) + .addReg(Div) + .addReg(ScratchReg); + Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod) + .addReg(Mul) + .addReg(NegSizeReg); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg) + .addReg(FramePointer) + .addReg(SPReg) + .addReg(NegMod); + } + + { + // Remaining part should be multiple of ProbeSize. + Register CmpResult = MRI.createVirtualRegister(&PPC::CRRCRegClass); + BuildMI(TestMBB, DL, TII->get(isPPC64 ? PPC::CMPD : PPC::CMPW), CmpResult) + .addReg(SPReg) + .addReg(FinalStackPtr); + BuildMI(TestMBB, DL, TII->get(PPC::BCC)) + .addImm(PPC::PRED_EQ) + .addReg(CmpResult) + .addMBB(TailMBB); + TestMBB->addSuccessor(BlockMBB); + TestMBB->addSuccessor(TailMBB); + } + + { + // Touch the block. + // |P...|P...|P... + BuildMI(BlockMBB, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg) + .addReg(FramePointer) + .addReg(SPReg) + .addReg(ScratchReg); + BuildMI(BlockMBB, DL, TII->get(PPC::B)).addMBB(TestMBB); + BlockMBB->addSuccessor(TestMBB); + } + + // Calculation of MaxCallFrameSize is deferred to prologepilog, use + // DYNAREAOFFSET pseudo instruction to get the future result. + Register MaxCallFrameSizeReg = + MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + BuildMI(TailMBB, DL, + TII->get(isPPC64 ? PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET), + MaxCallFrameSizeReg) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + BuildMI(TailMBB, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4), DstReg) + .addReg(SPReg) + .addReg(MaxCallFrameSizeReg); + + // Splice instructions after MI to TailMBB. 
+ TailMBB->splice(TailMBB->end(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + TailMBB->transferSuccessorsAndUpdatePHIs(MBB); + MBB->addSuccessor(TestMBB); + + // Delete the pseudo instruction. + MI.eraseFromParent(); + + ++NumDynamicAllocaProbed; + return TailMBB; +} + MachineBasicBlock * PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -12565,6 +12747,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(NewFPSCRReg) .addImm(0) .addImm(0); + } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 || + MI.getOpcode() == PPC::PROBED_ALLOCA_64) { + return emitProbedAlloca(MI, BB); } else { llvm_unreachable("Unexpected instr type to insert"); } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index bd306faf069c..e47f6adcf373 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -138,6 +138,10 @@ namespace llvm { /// dynamic alloca. DYNAREAOFFSET, + /// To avoid stack clash, allocation is performed by block and each block is + /// probed. + PROBED_ALLOCA, + /// GlobalBaseReg - On Darwin, this node represents the result of the mflr /// at function entry, used for PIC code. GlobalBaseReg, @@ -804,6 +808,13 @@ namespace llvm { MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const; + MachineBasicBlock *emitProbedAlloca(MachineInstr &MI, + MachineBasicBlock *MBB) const; + + bool hasInlineStackProbe(MachineFunction &MF) const override; + + unsigned getStackProbeSize(MachineFunction &MF) const; + ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. 
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index cc0c787c7b21..25001ad8ace3 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -425,6 +425,16 @@ def DYNALLOC8 : PPCEmitTimePseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>; def DYNAREAOFFSET8 : PPCEmitTimePseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>; +// Probed alloca to support stack clash protection. +let Defs = [X1], Uses = [X1], hasNoSchedulingInfo = 1 in { +def PROBED_ALLOCA_64 : PPCCustomInserterPseudo<(outs g8rc:$result), + (ins g8rc:$negsize, memri:$fpsi), "#PROBED_ALLOCA_64", + [(set i64:$result, + (PPCprobedalloca i64:$negsize, iaddr:$fpsi))]>; +def PREPARE_PROBED_ALLOCA_64 : PPCEmitTimePseudo<(outs g8rc:$fp, + g8rc:$sp), + (ins g8rc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_64", []>; +} let hasSideEffects = 0 in { let Defs = [LR8] in { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 99aa99906ad0..9c986fc52b68 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -323,6 +323,7 @@ def SDTDynOp : SDTypeProfile<1, 2, []>; def SDTDynAreaOp : SDTypeProfile<1, 1, []>; def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; +def PPCprobedalloca : SDNode<"PPCISD::PROBED_ALLOCA", SDTDynOp, [SDNPHasChain]>; // PC Relative Specific Nodes def PPCmatpcreladdr : SDNode<"PPCISD::MAT_PCREL_ADDR", SDTIntUnaryOp, []>; @@ -1399,6 +1400,16 @@ def DYNALLOC : PPCEmitTimePseudo<(outs gprc:$result), (ins gprc:$negsize, memri: (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>; def DYNAREAOFFSET : PPCEmitTimePseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET", [(set i32:$result, 
(PPCdynareaoffset iaddr:$fpsi))]>; +// Probed alloca to support stack clash protection. +let Defs = [R1], Uses = [R1], hasNoSchedulingInfo = 1 in { +def PROBED_ALLOCA_32 : PPCCustomInserterPseudo<(outs gprc:$result), + (ins gprc:$negsize, memri:$fpsi), "#PROBED_ALLOCA_32", + [(set i32:$result, + (PPCprobedalloca i32:$negsize, iaddr:$fpsi))]>; +def PREPARE_PROBED_ALLOCA_32 : PPCEmitTimePseudo<(outs gprc:$fp, + gprc:$sp), + (ins gprc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_32", []>; +} // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index a82b04384711..35f5e1fbebcd 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -610,6 +610,38 @@ void PPCRegisterInfo::prepareDynamicAlloca(MachineBasicBlock::iterator II, } } +void PPCRegisterInfo::lowerPrepareProbedAlloca( + MachineBasicBlock::iterator II) const { + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget(); + // Get the instruction info. + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + // Determine whether 64-bit pointers are used. 
+ bool LP64 = TM.isPPC64(); + DebugLoc dl = MI.getDebugLoc(); + Register FramePointer = MI.getOperand(0).getReg(); + Register FinalStackPtr = MI.getOperand(1).getReg(); + bool KillNegSizeReg = MI.getOperand(2).isKill(); + Register NegSizeReg = MI.getOperand(2).getReg(); + prepareDynamicAlloca(II, NegSizeReg, KillNegSizeReg, FramePointer); + if (LP64) { + BuildMI(MBB, II, dl, TII.get(PPC::ADD8), FinalStackPtr) + .addReg(PPC::X1) + .addReg(NegSizeReg, getKillRegState(KillNegSizeReg)); + + } else { + BuildMI(MBB, II, dl, TII.get(PPC::ADD4), FinalStackPtr) + .addReg(PPC::R1) + .addReg(NegSizeReg, getKillRegState(KillNegSizeReg)); + } + + MBB.erase(II); +} + void PPCRegisterInfo::lowerDynamicAreaOffset( MachineBasicBlock::iterator II) const { // Get the instruction. @@ -1050,6 +1082,13 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, return; } + if (FPSI && FrameIndex == FPSI && + (OpC == PPC::PREPARE_PROBED_ALLOCA_64 || + OpC == PPC::PREPARE_PROBED_ALLOCA_32)) { + lowerPrepareProbedAlloca(II); + return; + } + // Special case for pseudo-ops SPILL_CR and RESTORE_CR, etc. 
if (OpC == PPC::SPILL_CR) { lowerCRSpilling(II, FrameIndex); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 884c7f09d498..61acd955e1cb 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -110,6 +110,7 @@ public: void prepareDynamicAlloca(MachineBasicBlock::iterator II, Register &NegSizeReg, bool &KillNegSizeReg, Register &FramePointer) const; + void lowerPrepareProbedAlloca(MachineBasicBlock::iterator II) const; void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex) const; void lowerCRRestore(MachineBasicBlock::iterator II, diff --git a/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll new file mode 100644 index 000000000000..6c136e9a541c --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll @@ -0,0 +1,437 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc64le-linux-gnu < %s | FileCheck \ +; RUN: -check-prefix=CHECK-LE %s +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc64le-linux-gnu -mcpu=pwr9 < %s | FileCheck \ +; RUN: -check-prefix=CHECK-P9-LE %s +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc64-linux-gnu < %s | FileCheck \ +; RUN: -check-prefix=CHECK-BE %s +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc-linux-gnu < %s | FileCheck \ +; RUN: -check-prefix=CHECK-32 %s + +define i32 @foo(i32 %n) local_unnamed_addr #0 "stack-probe-size"="32768" nounwind { +; CHECK-LE-LABEL: foo: +; CHECK-LE: # %bb.0: +; CHECK-LE-NEXT: std r31, -8(r1) +; CHECK-LE-NEXT: stdu r1, -48(r1) +; CHECK-LE-NEXT: rldic r3, r3, 2, 30 +; CHECK-LE-NEXT: li r6, -32768 +; CHECK-LE-NEXT: mr r31, r1 +; CHECK-LE-NEXT: addi r3, r3, 15 +; CHECK-LE-NEXT: addi r4, r31, 
48 +; CHECK-LE-NEXT: rldicl r3, r3, 60, 4 +; CHECK-LE-NEXT: rldicl r3, r3, 4, 29 +; CHECK-LE-NEXT: neg r5, r3 +; CHECK-LE-NEXT: divd r7, r5, r6 +; CHECK-LE-NEXT: add r3, r1, r5 +; CHECK-LE-NEXT: mulld r6, r7, r6 +; CHECK-LE-NEXT: sub r5, r5, r6 +; CHECK-LE-NEXT: stdux r4, r1, r5 +; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: beq cr0, .LBB0_2 +; CHECK-LE-NEXT: .LBB0_1: +; CHECK-LE-NEXT: stdu r4, -32768(r1) +; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: bne cr0, .LBB0_1 +; CHECK-LE-NEXT: .LBB0_2: +; CHECK-LE-NEXT: li r4, 1 +; CHECK-LE-NEXT: addi r3, r1, 32 +; CHECK-LE-NEXT: stw r4, 4792(r3) +; CHECK-LE-NEXT: lwz r3, 0(r3) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: ld r31, -8(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-P9-LE-LABEL: foo: +; CHECK-P9-LE: # %bb.0: +; CHECK-P9-LE-NEXT: std r31, -8(r1) +; CHECK-P9-LE-NEXT: stdu r1, -48(r1) +; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30 +; CHECK-P9-LE-NEXT: addi r3, r3, 15 +; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4 +; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29 +; CHECK-P9-LE-NEXT: neg r5, r3 +; CHECK-P9-LE-NEXT: li r6, -32768 +; CHECK-P9-LE-NEXT: divd r7, r5, r6 +; CHECK-P9-LE-NEXT: mulld r6, r7, r6 +; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r4, r31, 48 +; CHECK-P9-LE-NEXT: add r3, r1, r5 +; CHECK-P9-LE-NEXT: sub r5, r5, r6 +; CHECK-P9-LE-NEXT: stdux r4, r1, r5 +; CHECK-P9-LE-NEXT: cmpd r1, r3 +; CHECK-P9-LE-NEXT: beq cr0, .LBB0_2 +; CHECK-P9-LE-NEXT: .LBB0_1: +; CHECK-P9-LE-NEXT: stdu r4, -32768(r1) +; CHECK-P9-LE-NEXT: cmpd r1, r3 +; CHECK-P9-LE-NEXT: bne cr0, .LBB0_1 +; CHECK-P9-LE-NEXT: .LBB0_2: +; CHECK-P9-LE-NEXT: addi r3, r1, 32 +; CHECK-P9-LE-NEXT: li r4, 1 +; CHECK-P9-LE-NEXT: stw r4, 4792(r3) +; CHECK-P9-LE-NEXT: lwz r3, 0(r3) +; CHECK-P9-LE-NEXT: ld r1, 0(r1) +; CHECK-P9-LE-NEXT: ld r31, -8(r1) +; CHECK-P9-LE-NEXT: blr +; +; CHECK-BE-LABEL: foo: +; CHECK-BE: # %bb.0: +; CHECK-BE-NEXT: std r31, -8(r1) +; CHECK-BE-NEXT: stdu r1, -64(r1) +; CHECK-BE-NEXT: rldic r3, r3, 2, 30 +; CHECK-BE-NEXT: li r6, -32768 +; 
CHECK-BE-NEXT: addi r3, r3, 15 +; CHECK-BE-NEXT: rldicl r3, r3, 60, 4 +; CHECK-BE-NEXT: mr r31, r1 +; CHECK-BE-NEXT: rldicl r3, r3, 4, 29 +; CHECK-BE-NEXT: addi r4, r31, 64 +; CHECK-BE-NEXT: neg r5, r3 +; CHECK-BE-NEXT: divd r7, r5, r6 +; CHECK-BE-NEXT: add r3, r1, r5 +; CHECK-BE-NEXT: mulld r6, r7, r6 +; CHECK-BE-NEXT: sub r5, r5, r6 +; CHECK-BE-NEXT: stdux r4, r1, r5 +; CHECK-BE-NEXT: cmpd r1, r3 +; CHECK-BE-NEXT: beq cr0, .LBB0_2 +; CHECK-BE-NEXT: .LBB0_1: +; CHECK-BE-NEXT: stdu r4, -32768(r1) +; CHECK-BE-NEXT: cmpd r1, r3 +; CHECK-BE-NEXT: bne cr0, .LBB0_1 +; CHECK-BE-NEXT: .LBB0_2: +; CHECK-BE-NEXT: li r4, 1 +; CHECK-BE-NEXT: addi r3, r1, 48 +; CHECK-BE-NEXT: stw r4, 4792(r3) +; CHECK-BE-NEXT: lwz r3, 0(r3) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: ld r31, -8(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: foo: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: stwu r1, -32(r1) +; CHECK-32-NEXT: slwi r3, r3, 2 +; CHECK-32-NEXT: addi r3, r3, 15 +; CHECK-32-NEXT: rlwinm r3, r3, 0, 0, 27 +; CHECK-32-NEXT: neg r5, r3 +; CHECK-32-NEXT: li r6, -32768 +; CHECK-32-NEXT: divw r7, r5, r6 +; CHECK-32-NEXT: stw r31, 28(r1) +; CHECK-32-NEXT: mr r31, r1 +; CHECK-32-NEXT: addi r4, r31, 32 +; CHECK-32-NEXT: add r3, r1, r5 +; CHECK-32-NEXT: mullw r6, r7, r6 +; CHECK-32-NEXT: sub r5, r5, r6 +; CHECK-32-NEXT: stwux r4, r1, r5 +; CHECK-32-NEXT: cmpw r1, r3 +; CHECK-32-NEXT: beq cr0, .LBB0_2 +; CHECK-32-NEXT: .LBB0_1: +; CHECK-32-NEXT: stwu r4, -32768(r1) +; CHECK-32-NEXT: cmpw r1, r3 +; CHECK-32-NEXT: bne cr0, .LBB0_1 +; CHECK-32-NEXT: .LBB0_2: +; CHECK-32-NEXT: li r4, 1 +; CHECK-32-NEXT: addi r3, r1, 16 +; CHECK-32-NEXT: stw r4, 4792(r3) +; CHECK-32-NEXT: lwz r3, 0(r3) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: lwz r0, -4(r31) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr + %a = alloca i32, i32 %n, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 1198 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + 
+define i32 @bar(i32 %n) local_unnamed_addr #0 nounwind { +; CHECK-LE-LABEL: bar: +; CHECK-LE: # %bb.0: +; CHECK-LE-NEXT: std r31, -8(r1) +; CHECK-LE-NEXT: stdu r1, -48(r1) +; CHECK-LE-NEXT: rldic r4, r3, 2, 30 +; CHECK-LE-NEXT: li r7, -4096 +; CHECK-LE-NEXT: mr r31, r1 +; CHECK-LE-NEXT: addi r4, r4, 15 +; CHECK-LE-NEXT: addi r5, r31, 48 +; CHECK-LE-NEXT: rldicl r4, r4, 60, 4 +; CHECK-LE-NEXT: rldicl r4, r4, 4, 29 +; CHECK-LE-NEXT: neg r6, r4 +; CHECK-LE-NEXT: divd r8, r6, r7 +; CHECK-LE-NEXT: add r4, r1, r6 +; CHECK-LE-NEXT: mulld r7, r8, r7 +; CHECK-LE-NEXT: sub r6, r6, r7 +; CHECK-LE-NEXT: stdux r5, r1, r6 +; CHECK-LE-NEXT: cmpd r1, r4 +; CHECK-LE-NEXT: beq cr0, .LBB1_2 +; CHECK-LE-NEXT: .LBB1_1: +; CHECK-LE-NEXT: stdu r5, -4096(r1) +; CHECK-LE-NEXT: cmpd r1, r4 +; CHECK-LE-NEXT: bne cr0, .LBB1_1 +; CHECK-LE-NEXT: .LBB1_2: +; CHECK-LE-NEXT: extsw r3, r3 +; CHECK-LE-NEXT: li r5, 1 +; CHECK-LE-NEXT: addi r4, r1, 32 +; CHECK-LE-NEXT: sldi r3, r3, 2 +; CHECK-LE-NEXT: add r3, r4, r3 +; CHECK-LE-NEXT: stw r5, 4096(r3) +; CHECK-LE-NEXT: lwz r3, 0(r4) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: ld r31, -8(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-P9-LE-LABEL: bar: +; CHECK-P9-LE: # %bb.0: +; CHECK-P9-LE-NEXT: std r31, -8(r1) +; CHECK-P9-LE-NEXT: stdu r1, -48(r1) +; CHECK-P9-LE-NEXT: rldic r4, r3, 2, 30 +; CHECK-P9-LE-NEXT: addi r4, r4, 15 +; CHECK-P9-LE-NEXT: rldicl r4, r4, 60, 4 +; CHECK-P9-LE-NEXT: rldicl r4, r4, 4, 29 +; CHECK-P9-LE-NEXT: neg r6, r4 +; CHECK-P9-LE-NEXT: li r7, -4096 +; CHECK-P9-LE-NEXT: divd r8, r6, r7 +; CHECK-P9-LE-NEXT: mulld r7, r8, r7 +; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r5, r31, 48 +; CHECK-P9-LE-NEXT: add r4, r1, r6 +; CHECK-P9-LE-NEXT: sub r6, r6, r7 +; CHECK-P9-LE-NEXT: stdux r5, r1, r6 +; CHECK-P9-LE-NEXT: cmpd r1, r4 +; CHECK-P9-LE-NEXT: beq cr0, .LBB1_2 +; CHECK-P9-LE-NEXT: .LBB1_1: +; CHECK-P9-LE-NEXT: stdu r5, -4096(r1) +; CHECK-P9-LE-NEXT: cmpd r1, r4 +; CHECK-P9-LE-NEXT: bne cr0, .LBB1_1 +; CHECK-P9-LE-NEXT: 
.LBB1_2: +; CHECK-P9-LE-NEXT: addi r4, r1, 32 +; CHECK-P9-LE-NEXT: extswsli r3, r3, 2 +; CHECK-P9-LE-NEXT: add r3, r4, r3 +; CHECK-P9-LE-NEXT: li r5, 1 +; CHECK-P9-LE-NEXT: stw r5, 4096(r3) +; CHECK-P9-LE-NEXT: lwz r3, 0(r4) +; CHECK-P9-LE-NEXT: ld r1, 0(r1) +; CHECK-P9-LE-NEXT: ld r31, -8(r1) +; CHECK-P9-LE-NEXT: blr +; +; CHECK-BE-LABEL: bar: +; CHECK-BE: # %bb.0: +; CHECK-BE-NEXT: std r31, -8(r1) +; CHECK-BE-NEXT: stdu r1, -64(r1) +; CHECK-BE-NEXT: rldic r4, r3, 2, 30 +; CHECK-BE-NEXT: li r7, -4096 +; CHECK-BE-NEXT: addi r4, r4, 15 +; CHECK-BE-NEXT: rldicl r4, r4, 60, 4 +; CHECK-BE-NEXT: mr r31, r1 +; CHECK-BE-NEXT: rldicl r4, r4, 4, 29 +; CHECK-BE-NEXT: addi r5, r31, 64 +; CHECK-BE-NEXT: neg r6, r4 +; CHECK-BE-NEXT: divd r8, r6, r7 +; CHECK-BE-NEXT: add r4, r1, r6 +; CHECK-BE-NEXT: mulld r7, r8, r7 +; CHECK-BE-NEXT: sub r6, r6, r7 +; CHECK-BE-NEXT: stdux r5, r1, r6 +; CHECK-BE-NEXT: cmpd r1, r4 +; CHECK-BE-NEXT: beq cr0, .LBB1_2 +; CHECK-BE-NEXT: .LBB1_1: +; CHECK-BE-NEXT: stdu r5, -4096(r1) +; CHECK-BE-NEXT: cmpd r1, r4 +; CHECK-BE-NEXT: bne cr0, .LBB1_1 +; CHECK-BE-NEXT: .LBB1_2: +; CHECK-BE-NEXT: extsw r3, r3 +; CHECK-BE-NEXT: addi r4, r1, 48 +; CHECK-BE-NEXT: sldi r3, r3, 2 +; CHECK-BE-NEXT: li r5, 1 +; CHECK-BE-NEXT: add r3, r4, r3 +; CHECK-BE-NEXT: stw r5, 4096(r3) +; CHECK-BE-NEXT: lwz r3, 0(r4) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: ld r31, -8(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: bar: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: stwu r1, -32(r1) +; CHECK-32-NEXT: slwi r3, r3, 2 +; CHECK-32-NEXT: addi r4, r3, 15 +; CHECK-32-NEXT: rlwinm r4, r4, 0, 0, 27 +; CHECK-32-NEXT: neg r6, r4 +; CHECK-32-NEXT: li r7, -4096 +; CHECK-32-NEXT: divw r8, r6, r7 +; CHECK-32-NEXT: stw r31, 28(r1) +; CHECK-32-NEXT: mr r31, r1 +; CHECK-32-NEXT: addi r5, r31, 32 +; CHECK-32-NEXT: add r4, r1, r6 +; CHECK-32-NEXT: mullw r7, r8, r7 +; CHECK-32-NEXT: sub r6, r6, r7 +; CHECK-32-NEXT: stwux r5, r1, r6 +; CHECK-32-NEXT: cmpw r1, r4 +; CHECK-32-NEXT: beq cr0, .LBB1_2 
+; CHECK-32-NEXT: .LBB1_1: +; CHECK-32-NEXT: stwu r5, -4096(r1) +; CHECK-32-NEXT: cmpw r1, r4 +; CHECK-32-NEXT: bne cr0, .LBB1_1 +; CHECK-32-NEXT: .LBB1_2: +; CHECK-32-NEXT: addi r4, r1, 16 +; CHECK-32-NEXT: li r5, 1 +; CHECK-32-NEXT: add r3, r4, r3 +; CHECK-32-NEXT: stw r5, 4096(r3) +; CHECK-32-NEXT: lwz r3, 0(r4) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: lwz r0, -4(r31) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr + %a = alloca i32, i32 %n, align 16 + %i = add i32 %n, 1024 + %b = getelementptr inbounds i32, i32* %a, i32 %i + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +define i32 @f(i32 %n) local_unnamed_addr #0 "stack-probe-size"="65536" nounwind { +; CHECK-LE-LABEL: f: +; CHECK-LE: # %bb.0: +; CHECK-LE-NEXT: std r31, -8(r1) +; CHECK-LE-NEXT: stdu r1, -48(r1) +; CHECK-LE-NEXT: rldic r3, r3, 2, 30 +; CHECK-LE-NEXT: lis r5, -1 +; CHECK-LE-NEXT: mr r31, r1 +; CHECK-LE-NEXT: addi r3, r3, 15 +; CHECK-LE-NEXT: ori r5, r5, 0 +; CHECK-LE-NEXT: addi r4, r31, 48 +; CHECK-LE-NEXT: rldicl r3, r3, 60, 4 +; CHECK-LE-NEXT: rldicl r3, r3, 4, 29 +; CHECK-LE-NEXT: neg r6, r3 +; CHECK-LE-NEXT: divd r7, r6, r5 +; CHECK-LE-NEXT: add r3, r1, r6 +; CHECK-LE-NEXT: mulld r7, r7, r5 +; CHECK-LE-NEXT: sub r6, r6, r7 +; CHECK-LE-NEXT: stdux r4, r1, r6 +; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: beq cr0, .LBB2_2 +; CHECK-LE-NEXT: .LBB2_1: +; CHECK-LE-NEXT: stdux r4, r1, r5 +; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: bne cr0, .LBB2_1 +; CHECK-LE-NEXT: .LBB2_2: +; CHECK-LE-NEXT: li r4, 1 +; CHECK-LE-NEXT: addi r3, r1, 32 +; CHECK-LE-NEXT: stw r4, 4792(r3) +; CHECK-LE-NEXT: lwz r3, 0(r3) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: ld r31, -8(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-P9-LE-LABEL: f: +; CHECK-P9-LE: # %bb.0: +; CHECK-P9-LE-NEXT: std r31, -8(r1) +; CHECK-P9-LE-NEXT: stdu r1, -48(r1) +; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30 +; CHECK-P9-LE-NEXT: addi r3, r3, 15 +; CHECK-P9-LE-NEXT: rldicl r3, 
r3, 60, 4 +; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29 +; CHECK-P9-LE-NEXT: lis r5, -1 +; CHECK-P9-LE-NEXT: ori r5, r5, 0 +; CHECK-P9-LE-NEXT: neg r6, r3 +; CHECK-P9-LE-NEXT: divd r7, r6, r5 +; CHECK-P9-LE-NEXT: mulld r7, r7, r5 +; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r4, r31, 48 +; CHECK-P9-LE-NEXT: add r3, r1, r6 +; CHECK-P9-LE-NEXT: sub r6, r6, r7 +; CHECK-P9-LE-NEXT: stdux r4, r1, r6 +; CHECK-P9-LE-NEXT: cmpd r1, r3 +; CHECK-P9-LE-NEXT: beq cr0, .LBB2_2 +; CHECK-P9-LE-NEXT: .LBB2_1: +; CHECK-P9-LE-NEXT: stdux r4, r1, r5 +; CHECK-P9-LE-NEXT: cmpd r1, r3 +; CHECK-P9-LE-NEXT: bne cr0, .LBB2_1 +; CHECK-P9-LE-NEXT: .LBB2_2: +; CHECK-P9-LE-NEXT: addi r3, r1, 32 +; CHECK-P9-LE-NEXT: li r4, 1 +; CHECK-P9-LE-NEXT: stw r4, 4792(r3) +; CHECK-P9-LE-NEXT: lwz r3, 0(r3) +; CHECK-P9-LE-NEXT: ld r1, 0(r1) +; CHECK-P9-LE-NEXT: ld r31, -8(r1) +; CHECK-P9-LE-NEXT: blr +; +; CHECK-BE-LABEL: f: +; CHECK-BE: # %bb.0: +; CHECK-BE-NEXT: std r31, -8(r1) +; CHECK-BE-NEXT: stdu r1, -64(r1) +; CHECK-BE-NEXT: rldic r3, r3, 2, 30 +; CHECK-BE-NEXT: lis r5, -1 +; CHECK-BE-NEXT: addi r3, r3, 15 +; CHECK-BE-NEXT: rldicl r3, r3, 60, 4 +; CHECK-BE-NEXT: ori r5, r5, 0 +; CHECK-BE-NEXT: rldicl r3, r3, 4, 29 +; CHECK-BE-NEXT: mr r31, r1 +; CHECK-BE-NEXT: neg r6, r3 +; CHECK-BE-NEXT: divd r7, r6, r5 +; CHECK-BE-NEXT: addi r4, r31, 64 +; CHECK-BE-NEXT: mulld r7, r7, r5 +; CHECK-BE-NEXT: add r3, r1, r6 +; CHECK-BE-NEXT: sub r6, r6, r7 +; CHECK-BE-NEXT: stdux r4, r1, r6 +; CHECK-BE-NEXT: cmpd r1, r3 +; CHECK-BE-NEXT: beq cr0, .LBB2_2 +; CHECK-BE-NEXT: .LBB2_1: +; CHECK-BE-NEXT: stdux r4, r1, r5 +; CHECK-BE-NEXT: cmpd r1, r3 +; CHECK-BE-NEXT: bne cr0, .LBB2_1 +; CHECK-BE-NEXT: .LBB2_2: +; CHECK-BE-NEXT: li r4, 1 +; CHECK-BE-NEXT: addi r3, r1, 48 +; CHECK-BE-NEXT: stw r4, 4792(r3) +; CHECK-BE-NEXT: lwz r3, 0(r3) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: ld r31, -8(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: stwu r1, -32(r1) +; 
CHECK-32-NEXT: slwi r3, r3, 2 +; CHECK-32-NEXT: addi r3, r3, 15 +; CHECK-32-NEXT: rlwinm r3, r3, 0, 0, 27 +; CHECK-32-NEXT: lis r5, -1 +; CHECK-32-NEXT: neg r6, r3 +; CHECK-32-NEXT: ori r5, r5, 0 +; CHECK-32-NEXT: divw r7, r6, r5 +; CHECK-32-NEXT: stw r31, 28(r1) +; CHECK-32-NEXT: mr r31, r1 +; CHECK-32-NEXT: addi r4, r31, 32 +; CHECK-32-NEXT: add r3, r1, r6 +; CHECK-32-NEXT: mullw r7, r7, r5 +; CHECK-32-NEXT: sub r6, r6, r7 +; CHECK-32-NEXT: stwux r4, r1, r6 +; CHECK-32-NEXT: cmpw r1, r3 +; CHECK-32-NEXT: beq cr0, .LBB2_2 +; CHECK-32-NEXT: .LBB2_1: +; CHECK-32-NEXT: stwux r4, r1, r5 +; CHECK-32-NEXT: cmpw r1, r3 +; CHECK-32-NEXT: bne cr0, .LBB2_1 +; CHECK-32-NEXT: .LBB2_2: +; CHECK-32-NEXT: li r4, 1 +; CHECK-32-NEXT: addi r3, r1, 16 +; CHECK-32-NEXT: stw r4, 4792(r3) +; CHECK-32-NEXT: lwz r3, 0(r3) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: lwz r0, -4(r31) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr + %a = alloca i32, i32 %n, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 1198 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"}