From 53bc37ca2a9ce79e03bb72edc4ace81442300f39 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Mon, 18 Feb 2013 20:55:12 +0000
Subject: [PATCH] Support for HiPE-compatible code emission, patch by Yiannis Tsiouris.

llvm-svn: 175457
---
 .../include/llvm/Target/TargetFrameLowering.h |   4 +
 llvm/lib/CodeGen/PrologEpilogInserter.cpp     |   8 +
 llvm/lib/Target/X86/X86FrameLowering.cpp      | 160 +++++++++++++++++-
 llvm/lib/Target/X86/X86FrameLowering.h        |   2 +
 llvm/test/CodeGen/X86/hipe-prologue.ll        |  67 ++++++++
 5 files changed, 236 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/hipe-prologue.ll

diff --git a/llvm/include/llvm/Target/TargetFrameLowering.h b/llvm/include/llvm/Target/TargetFrameLowering.h
index 1958f90f9b1b..ea279168f7c9 100644
--- a/llvm/include/llvm/Target/TargetFrameLowering.h
+++ b/llvm/include/llvm/Target/TargetFrameLowering.h
@@ -120,6 +120,10 @@ public:
   /// by adding a check even before the "normal" function prologue.
   virtual void adjustForSegmentedStacks(MachineFunction &MF) const { }
 
+  /// Adjust the prologue to add Erlang Run-Time System (ERTS) specific code in
+  /// the assembly prologue to explicitly handle the stack.
+  virtual void adjustForHiPEPrologue(MachineFunction &MF) const { }
+
   /// spillCalleeSavedRegisters - Issues instruction(s) to spill all callee
   /// saved registers and returns true if it isn't possible / profitable to do
   /// so by issuing a series of store instructions via
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 954613d6f99a..45e04a9c627b 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -693,6 +693,14 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
   // space in small chunks instead of one large contiguous block.
   if (Fn.getTarget().Options.EnableSegmentedStacks)
     TFI.adjustForSegmentedStacks(Fn);
+
+  // Emit additional code that is required to explicitly handle the stack in
+  // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The
+  // approach is rather similar to that of Segmented Stacks, but it uses a
+  // different conditional check and another BIF for allocating more stack
+  // space.
+  if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE)
+    TFI.adjustForHiPEPrologue(Fn);
 }
 
 /// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 950fd39742b4..eb9f865085be 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -1387,16 +1387,25 @@ HasNestArgument(const MachineFunction *MF) {
 }
 
-/// GetScratchRegister - Get a register for performing work in the segmented
-/// stack prologue. Depending on platform and the properties of the function
-/// either one or two registers will be needed. Set primary to true for
-/// the first register, false for the second.
+/// GetScratchRegister - Get a temp register for performing work in the
+/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
+/// and the properties of the function either one or two registers will be
+/// needed. Set primary to true for the first register, false for the second.
 static unsigned GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) {
+  CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+
+  // Erlang stuff.
+  if (CallingConvention == CallingConv::HiPE) {
+    if (Is64Bit)
+      return Primary ? X86::R14 : X86::R13;
+    else
+      return Primary ? X86::EBX : X86::EDI;
+  }
+
   if (Is64Bit)
     return Primary ? X86::R11 : X86::R12;
 
-  CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
   bool IsNested = HasNestArgument(&MF);
 
   if (CallingConvention == CallingConv::X86_FastCall ||
@@ -1603,3 +1612,144 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   MF.verify();
 #endif
 }
+
+// Erlang programs may need a special prologue to handle the stack size they
+// might need at runtime. That is because Erlang/OTP does not implement a C
+// stack but uses a custom implementation of a hybrid stack/heap
+// architecture. (For more information see Eric Stenman's Ph.D. thesis:
+// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+//
+//
+// CheckStack:
+//       temp0 = sp - MaxStack
+//       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+// OldStart:
+//       ...
+// IncStack:
+//       call inc_stack   # doubles the stack space
+//       temp0 = sp - MaxStack
+//       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
+  const X86InstrInfo &TII = *TM.getInstrInfo();
+  const X86Subtarget *ST = &MF.getTarget().getSubtarget<X86Subtarget>();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const uint64_t SlotSize = TM.getRegisterInfo()->getSlotSize();
+  const bool Is64Bit = STI.is64Bit();
+  DebugLoc DL;
+  // HiPE-specific values
+  const unsigned HipeLeafWords = 24;
+  const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
+  const unsigned Guaranteed = HipeLeafWords * SlotSize;
+  const unsigned CallerStkArity =
+    std::max<int>(0, (int)MF.getFunction()->arg_size() - (int)CCRegisteredArgs);
+  unsigned MaxStack =
+    MFI->getStackSize() + CallerStkArity * SlotSize + SlotSize;
+
+  assert(ST->isTargetLinux() &&
+         "HiPE prologue is only supported on Linux operating systems.");
+
+  // Compute the largest caller's frame that is needed to fit the callees'
+  // frames. This 'MaxStack' is computed from:
+  //
+  // a) the fixed frame size, which is the space needed for all spilled temps,
+  // b) outgoing on-stack parameter areas, and
+  // c) the minimum stack space this function needs to make available for the
+  //    functions it calls (a tunable ABI property).
+  if (MFI->hasCalls()) {
+    unsigned MoreStackForCalls = 0;
+
+    for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
+         MBBI != MBBE; ++MBBI)
+      for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
+           MI != ME; ++MI)
+        if (MI->isCall()) {
+          // Get callee operand.
+          const MachineOperand &MO = MI->getOperand(0);
+          const Function *F;
+
+          // Only take account of global function calls (no closures etc.).
+          if (!MO.isGlobal()) continue;
+          if (!(F = dyn_cast<Function>(MO.getGlobal()))) continue;
+
+          // Do not update 'MaxStack' for primitive and built-in functions
+          // (encoded with names either starting with "erlang."/"bif_" or not
+          // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
+          // "_", such as the BIF "suspend_0") as they are executed on another
+          // stack.
+          if ((F->getName().find("erlang.") != std::string::npos) ||
+              (F->getName().find("bif_") != std::string::npos)) continue;
+          if (F->getName().find_first_of("._") == std::string::npos)
+            continue;
+
+          const uint64_t CalleeStkArity =
+            std::max<int>(0, (int)F->arg_size() - (int)CCRegisteredArgs);
+          MoreStackForCalls = std::max<unsigned>(MoreStackForCalls,
+            (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+        }
+    MaxStack += MoreStackForCalls;
+  }
+
+  // If the stack frame needed is larger than the guaranteed size, then runtime
+  // checks and calls to the "inc_stack_0" BIF should be inserted in the
+  // assembly prologue.
+  if (MaxStack > Guaranteed) {
+    MachineBasicBlock &prologueMBB = MF.front();
+    MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
+    MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
+
+    for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(),
+         E = prologueMBB.livein_end(); I != E; I++) {
+      stackCheckMBB->addLiveIn(*I);
+      incStackMBB->addLiveIn(*I);
+    }
+
+    MF.push_front(incStackMBB);
+    MF.push_front(stackCheckMBB);
+
+    unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
+    unsigned LEAop, CMPop, CALLop;
+    if (Is64Bit) {
+      SPReg = X86::RSP;
+      PReg = X86::RBP;
+      LEAop = X86::LEA64r;
+      CMPop = X86::CMP64rm;
+      CALLop = X86::CALL64pcrel32;
+      SPLimitOffset = 0x90;
+    } else {
+      SPReg = X86::ESP;
+      PReg = X86::EBP;
+      LEAop = X86::LEA32r;
+      CMPop = X86::CMP32rm;
+      CALLop = X86::CALLpcrel32;
+      SPLimitOffset = 0x4c;
+    }
+
+    ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+    assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+           "HiPE prologue scratch register is live-in");
+
+    // Create new MBB for StackCheck:
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
+                 SPReg, false, -MaxStack);
+    // SPLimitOffset is in a fixed heap location (pointed by BP).
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
+                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
+    BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB);
+
+    // Create new MBB for IncStack:
+    BuildMI(incStackMBB, DL, TII.get(CALLop)).
+      addExternalSymbol("inc_stack_0");
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+                 SPReg, false, -MaxStack);
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
+    BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB);
+
+    stackCheckMBB->addSuccessor(&prologueMBB, 99);
+    stackCheckMBB->addSuccessor(incStackMBB, 1);
+    incStackMBB->addSuccessor(&prologueMBB, 99);
+    incStackMBB->addSuccessor(incStackMBB, 1);
+  }
+#ifdef XDEBUG
+  MF.verify();
+#endif
+}
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index dc515dc39c79..c35d9528b777 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -43,6 +43,8 @@ public:
   void adjustForSegmentedStacks(MachineFunction &MF) const;
 
+  void adjustForHiPEPrologue(MachineFunction &MF) const;
+
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                             RegScavenger *RS = NULL) const;
diff --git a/llvm/test/CodeGen/X86/hipe-prologue.ll b/llvm/test/CodeGen/X86/hipe-prologue.ll
new file mode 100644
index 000000000000..ff3c5c803c90
--- /dev/null
+++ b/llvm/test/CodeGen/X86/hipe-prologue.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
+
+; The HiPE compiler (i.e., the native code compiler of the Erlang/OTP system)
+; adds a custom assembly prologue in order to efficiently manipulate the stack
+; at runtime.
+
+; Just to prevent the alloca from being optimized away.
+declare void @dummy_use(i32*, i32)
+
+define {i32, i32} @test_basic(i32 %hp, i32 %p) {
+  ; X32-Linux:       test_basic:
+  ; X32-Linux-NOT:   calll inc_stack_0
+
+  ; X64-Linux:       test_basic:
+  ; X64-Linux-NOT:   callq inc_stack_0
+
+  %mem = alloca i32, i32 10
+  call void @dummy_use (i32* %mem, i32 10)
+  %1 = insertvalue {i32, i32} undef, i32 %hp, 0
+  %2 = insertvalue {i32, i32} %1, i32 %p, 1
+  ret {i32, i32} %1
+}
+
+define cc 11 {i32, i32} @test_basic_hipecc(i32 %hp, i32 %p) {
+  ; X32-Linux:       test_basic_hipecc:
+  ; X32-Linux:       leal -156(%esp), %ebx
+  ; X32-Linux-NEXT:  cmpl 76(%ebp), %ebx
+  ; X32-Linux-NEXT:  jb .LBB1_1
+
+  ; X32-Linux:       ret
+
+  ; X32-Linux:       .LBB1_1:
+  ; X32-Linux-NEXT:  calll inc_stack_0
+
+  ; X64-Linux:       test_basic_hipecc:
+  ; X64-Linux:       leaq -232(%rsp), %r14
+  ; X64-Linux-NEXT:  cmpq 144(%rbp), %r14
+  ; X64-Linux-NEXT:  jb .LBB1_1
+
+  ; X64-Linux:       ret
+
+  ; X64-Linux:       .LBB1_1:
+  ; X64-Linux-NEXT:  callq inc_stack_0
+
+  %mem = alloca i32, i32 10
+  call void @dummy_use (i32* %mem, i32 10)
+  %1 = insertvalue {i32, i32} undef, i32 %hp, 0
+  %2 = insertvalue {i32, i32} %1, i32 %p, 1
+  ret {i32, i32} %2
+}
+
+define cc 11 {i32,i32,i32} @test_nocall_hipecc(i32 %hp,i32 %p,i32 %x,i32 %y) {
+  ; X32-Linux:       test_nocall_hipecc:
+  ; X32-Linux-NOT:   calll inc_stack_0
+
+  ; X64-Linux:       test_nocall_hipecc:
+  ; X64-Linux-NOT:   callq inc_stack_0
+
+  %1 = add i32 %x, %y
+  %2 = mul i32 42, %1
+  %3 = sub i32 24, %2
+  %4 = insertvalue {i32, i32, i32} undef, i32 %hp, 0
+  %5 = insertvalue {i32, i32, i32} %4, i32 %p, 1
+  %6 = insertvalue {i32, i32, i32} %5, i32 %p, 2
+  ret {i32, i32, i32} %6
+}
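
A quick way to exercise the new prologue by hand, mirroring the RUN lines of
the test above (a minimal sketch: the file name "demo.ll" and the function
name "@demo" are placeholders, and the exact frame offsets in the emitted
code depend on the function's frame size):

  $ cat demo.ll
  ; Just to prevent the alloca from being optimized away.
  declare void @dummy_use(i32*, i32)

  ; "cc 11" selects CallingConv::HiPE, so PEI calls adjustForHiPEPrologue.
  define cc 11 {i32, i32} @demo(i32 %hp, i32 %p) {
    %mem = alloca i32, i32 10
    call void @dummy_use(i32* %mem, i32 10)
    %1 = insertvalue {i32, i32} undef, i32 %hp, 0
    %2 = insertvalue {i32, i32} %1, i32 %p, 1
    ret {i32, i32} %2
  }

  $ llc < demo.ll -mcpu=generic -mtriple=x86_64-linux-gnu -verify-machineinstrs

On x86-64 the output should contain the stack check against the SP_LIMIT slot
at offset 0x90 off %rbp (the "cmpq 144(%rbp)" of the CHECK lines above)
followed by a conditional call to the inc_stack_0 BIF, as checked in
test_basic_hipecc.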