forked from OSchip/llvm-project
ARM: support struct byval in llvm
We handle struct byval by inserting a pseudo op, which will be expanded to a loop at ExpandISelPseudos. A separate patch for clang will be submitted to enable struct byval. rdar://9877866 llvm-svn: 157793
This commit is contained in:
parent
382dcfda20
commit
9f9111651e
|
@ -52,6 +52,7 @@ using namespace llvm;
|
||||||
|
|
||||||
STATISTIC(NumTailCalls, "Number of tail calls");
|
STATISTIC(NumTailCalls, "Number of tail calls");
|
||||||
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
|
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
|
||||||
|
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
|
||||||
|
|
||||||
// This option should go away when tail calls fully work.
|
// This option should go away when tail calls fully work.
|
||||||
static cl::opt<bool>
|
static cl::opt<bool>
|
||||||
|
@ -1424,21 +1425,21 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
|
||||||
CCInfo.clearFirstByValReg();
|
CCInfo.clearFirstByValReg();
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned LocMemOffset = VA.getLocMemOffset();
|
if (Flags.getByValSize() - 4*offset > 0) {
|
||||||
SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
|
unsigned LocMemOffset = VA.getLocMemOffset();
|
||||||
SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
|
SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
|
||||||
StkPtrOff);
|
SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
|
||||||
SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
|
StkPtrOff);
|
||||||
SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
|
SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
|
||||||
SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
|
SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
|
||||||
MVT::i32);
|
SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
|
||||||
MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
|
MVT::i32);
|
||||||
Flags.getByValAlign(),
|
|
||||||
/*isVolatile=*/false,
|
|
||||||
/*AlwaysInline=*/false,
|
|
||||||
MachinePointerInfo(0),
|
|
||||||
MachinePointerInfo(0)));
|
|
||||||
|
|
||||||
|
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
||||||
|
SDValue Ops[] = { Chain, Dst, Src, SizeNode};
|
||||||
|
MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
|
||||||
|
Ops, array_lengthof(Ops)));
|
||||||
|
}
|
||||||
} else if (!IsSibCall) {
|
} else if (!IsSibCall) {
|
||||||
assert(VA.isMemLoc());
|
assert(VA.isMemLoc());
|
||||||
|
|
||||||
|
@ -6593,6 +6594,252 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
|
||||||
// return last added BB
|
// return last added BB
|
||||||
return SinkBB;
|
return SinkBB;
|
||||||
}
|
}
|
||||||
|
case ARM::COPY_STRUCT_BYVAL_I32: {
|
||||||
|
++NumLoopByVals;
|
||||||
|
// This pseudo instruction has 3 operands: dst, src, size
|
||||||
|
// We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
|
||||||
|
// Otherwise, we will generate unrolled scalar copies.
|
||||||
|
const BasicBlock *LLVM_BB = BB->getBasicBlock();
|
||||||
|
MachineFunction::iterator It = BB;
|
||||||
|
++It;
|
||||||
|
|
||||||
|
unsigned dest = MI->getOperand(0).getReg();
|
||||||
|
unsigned src = MI->getOperand(1).getReg();
|
||||||
|
unsigned size = MI->getOperand(2).getImm();
|
||||||
|
DebugLoc dl = MI->getDebugLoc();
|
||||||
|
unsigned BytesLeft = size & 3;
|
||||||
|
unsigned LoopSize = size - BytesLeft;
|
||||||
|
|
||||||
|
bool isThumb2 = Subtarget->isThumb2();
|
||||||
|
MachineFunction *MF = BB->getParent();
|
||||||
|
MachineRegisterInfo &MRI = MF->getRegInfo();
|
||||||
|
unsigned ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
|
||||||
|
unsigned strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
|
||||||
|
|
||||||
|
const TargetRegisterClass *TRC = isThumb2 ?
|
||||||
|
(const TargetRegisterClass*)&ARM::tGPRRegClass :
|
||||||
|
(const TargetRegisterClass*)&ARM::GPRRegClass;
|
||||||
|
|
||||||
|
if (size <= Subtarget->getMaxInlineSizeThreshold()) {
|
||||||
|
// Use LDR and STR to copy.
|
||||||
|
// [scratch, srcOut] = LDR_POST(srcIn, 4)
|
||||||
|
// [destOut] = STR_POST(scratch, destIn, 4)
|
||||||
|
unsigned srcIn = src;
|
||||||
|
unsigned destIn = dest;
|
||||||
|
for (unsigned i = 0; i < LoopSize; i+=4) {
|
||||||
|
unsigned scratch = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned srcOut = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned destOut = MRI.createVirtualRegister(TRC);
|
||||||
|
if (isThumb2) {
|
||||||
|
AddDefaultPred(BuildMI(*BB, MI, dl,
|
||||||
|
TII->get(ldrOpc), scratch)
|
||||||
|
.addReg(srcOut, RegState::Define).addReg(srcIn).addImm(4));
|
||||||
|
|
||||||
|
AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
|
||||||
|
.addReg(scratch).addReg(destIn)
|
||||||
|
.addImm(4));
|
||||||
|
} else {
|
||||||
|
AddDefaultPred(BuildMI(*BB, MI, dl,
|
||||||
|
TII->get(ldrOpc), scratch)
|
||||||
|
.addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(4));
|
||||||
|
|
||||||
|
AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
|
||||||
|
.addReg(scratch).addReg(destIn)
|
||||||
|
.addReg(0).addImm(4));
|
||||||
|
}
|
||||||
|
srcIn = srcOut;
|
||||||
|
destIn = destOut;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle the leftover bytes with LDRB and STRB.
|
||||||
|
// [scratch, srcOut] = LDRB_POST(srcIn, 1)
|
||||||
|
// [destOut] = STRB_POST(scratch, destIn, 1)
|
||||||
|
ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
|
||||||
|
strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
|
||||||
|
for (unsigned i = 0; i < BytesLeft; i++) {
|
||||||
|
unsigned scratch = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned srcOut = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned destOut = MRI.createVirtualRegister(TRC);
|
||||||
|
if (isThumb2) {
|
||||||
|
AddDefaultPred(BuildMI(*BB, MI, dl,
|
||||||
|
TII->get(ldrOpc),scratch)
|
||||||
|
.addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
|
||||||
|
|
||||||
|
AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
|
||||||
|
.addReg(scratch).addReg(destIn)
|
||||||
|
.addReg(0).addImm(1));
|
||||||
|
} else {
|
||||||
|
AddDefaultPred(BuildMI(*BB, MI, dl,
|
||||||
|
TII->get(ldrOpc),scratch)
|
||||||
|
.addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
|
||||||
|
|
||||||
|
AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
|
||||||
|
.addReg(scratch).addReg(destIn)
|
||||||
|
.addReg(0).addImm(1));
|
||||||
|
}
|
||||||
|
srcIn = srcOut;
|
||||||
|
destIn = destOut;
|
||||||
|
}
|
||||||
|
MI->eraseFromParent(); // The instruction is gone now.
|
||||||
|
return BB;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Expand the pseudo op to a loop.
|
||||||
|
// thisMBB:
|
||||||
|
// ...
|
||||||
|
// movw varEnd, # --> with thumb2
|
||||||
|
// movt varEnd, #
|
||||||
|
// ldrcp varEnd, idx --> without thumb2
|
||||||
|
// fallthrough --> loopMBB
|
||||||
|
// loopMBB:
|
||||||
|
// PHI varPhi, varEnd, varLoop
|
||||||
|
// PHI srcPhi, src, srcLoop
|
||||||
|
// PHI destPhi, dst, destLoop
|
||||||
|
// [scratch, srcLoop] = LDR_POST(srcPhi, 4)
|
||||||
|
// [destLoop] = STR_POST(scratch, destPhi, 4)
|
||||||
|
// subs varLoop, varPhi, #4
|
||||||
|
// bne loopMBB
|
||||||
|
// fallthrough --> exitMBB
|
||||||
|
// exitMBB:
|
||||||
|
// epilogue to handle left-over bytes
|
||||||
|
// [scratch, srcOut] = LDRB_POST(srcLoop, 1)
|
||||||
|
// [destOut] = STRB_POST(scratch, destLoop, 1)
|
||||||
|
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
|
||||||
|
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
|
||||||
|
MF->insert(It, loopMBB);
|
||||||
|
MF->insert(It, exitMBB);
|
||||||
|
|
||||||
|
// Transfer the remainder of BB and its successor edges to exitMBB.
|
||||||
|
exitMBB->splice(exitMBB->begin(), BB,
|
||||||
|
llvm::next(MachineBasicBlock::iterator(MI)),
|
||||||
|
BB->end());
|
||||||
|
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
|
||||||
|
|
||||||
|
// Load an immediate to varEnd.
|
||||||
|
unsigned varEnd = MRI.createVirtualRegister(TRC);
|
||||||
|
if (isThumb2) {
|
||||||
|
unsigned VReg1 = varEnd;
|
||||||
|
if ((LoopSize & 0xFFFF0000) != 0)
|
||||||
|
VReg1 = MRI.createVirtualRegister(TRC);
|
||||||
|
AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
|
||||||
|
.addImm(LoopSize & 0xFFFF));
|
||||||
|
|
||||||
|
if ((LoopSize & 0xFFFF0000) != 0)
|
||||||
|
AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
|
||||||
|
.addReg(VReg1)
|
||||||
|
.addImm(LoopSize >> 16));
|
||||||
|
} else {
|
||||||
|
MachineConstantPool *ConstantPool = MF->getConstantPool();
|
||||||
|
Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
|
||||||
|
const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
|
||||||
|
|
||||||
|
// MachineConstantPool wants an explicit alignment.
|
||||||
|
unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
|
||||||
|
if (Align == 0)
|
||||||
|
Align = getTargetData()->getTypeAllocSize(C->getType());
|
||||||
|
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
|
||||||
|
|
||||||
|
AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
|
||||||
|
.addReg(varEnd, RegState::Define)
|
||||||
|
.addConstantPoolIndex(Idx)
|
||||||
|
.addImm(0));
|
||||||
|
}
|
||||||
|
BB->addSuccessor(loopMBB);
|
||||||
|
|
||||||
|
// Generate the loop body:
|
||||||
|
// varPhi = PHI(varLoop, varEnd)
|
||||||
|
// srcPhi = PHI(srcLoop, src)
|
||||||
|
// destPhi = PHI(destLoop, dst)
|
||||||
|
MachineBasicBlock *entryBB = BB;
|
||||||
|
BB = loopMBB;
|
||||||
|
unsigned varLoop = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned varPhi = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned srcLoop = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned srcPhi = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned destLoop = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned destPhi = MRI.createVirtualRegister(TRC);
|
||||||
|
|
||||||
|
BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
|
||||||
|
.addReg(varLoop).addMBB(loopMBB)
|
||||||
|
.addReg(varEnd).addMBB(entryBB);
|
||||||
|
BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
|
||||||
|
.addReg(srcLoop).addMBB(loopMBB)
|
||||||
|
.addReg(src).addMBB(entryBB);
|
||||||
|
BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
|
||||||
|
.addReg(destLoop).addMBB(loopMBB)
|
||||||
|
.addReg(dest).addMBB(entryBB);
|
||||||
|
|
||||||
|
// [scratch, srcLoop] = LDR_POST(srcPhi, 4)
|
||||||
|
// [destLoop] = STR_POST(scratch, destPhi, 4)
|
||||||
|
unsigned scratch = MRI.createVirtualRegister(TRC);
|
||||||
|
if (isThumb2) {
|
||||||
|
AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
|
||||||
|
.addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(4));
|
||||||
|
|
||||||
|
AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
|
||||||
|
.addReg(scratch).addReg(destPhi)
|
||||||
|
.addImm(4));
|
||||||
|
} else {
|
||||||
|
AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
|
||||||
|
.addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0).addImm(4));
|
||||||
|
|
||||||
|
AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
|
||||||
|
.addReg(scratch).addReg(destPhi)
|
||||||
|
.addReg(0).addImm(4));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decrement loop variable by 4.
|
||||||
|
MachineInstrBuilder MIB = BuildMI(BB, dl,
|
||||||
|
TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
|
||||||
|
AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(4)));
|
||||||
|
MIB->getOperand(5).setReg(ARM::CPSR);
|
||||||
|
MIB->getOperand(5).setIsDef(true);
|
||||||
|
|
||||||
|
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
|
||||||
|
.addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
|
||||||
|
|
||||||
|
// loopMBB can loop back to loopMBB or fall through to exitMBB.
|
||||||
|
BB->addSuccessor(loopMBB);
|
||||||
|
BB->addSuccessor(exitMBB);
|
||||||
|
|
||||||
|
// Add epilogue to handle BytesLeft.
|
||||||
|
BB = exitMBB;
|
||||||
|
MachineInstr *StartOfExit = exitMBB->begin();
|
||||||
|
ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
|
||||||
|
strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
|
||||||
|
|
||||||
|
// [scratch, srcOut] = LDRB_POST(srcLoop, 1)
|
||||||
|
// [destOut] = STRB_POST(scratch, destLoop, 1)
|
||||||
|
unsigned srcIn = srcLoop;
|
||||||
|
unsigned destIn = destLoop;
|
||||||
|
for (unsigned i = 0; i < BytesLeft; i++) {
|
||||||
|
unsigned scratch = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned srcOut = MRI.createVirtualRegister(TRC);
|
||||||
|
unsigned destOut = MRI.createVirtualRegister(TRC);
|
||||||
|
if (isThumb2) {
|
||||||
|
AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
|
||||||
|
TII->get(ldrOpc),scratch)
|
||||||
|
.addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
|
||||||
|
|
||||||
|
AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
|
||||||
|
.addReg(scratch).addReg(destIn)
|
||||||
|
.addImm(1));
|
||||||
|
} else {
|
||||||
|
AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
|
||||||
|
TII->get(ldrOpc),scratch)
|
||||||
|
.addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
|
||||||
|
|
||||||
|
AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
|
||||||
|
.addReg(scratch).addReg(destIn)
|
||||||
|
.addReg(0).addImm(1));
|
||||||
|
}
|
||||||
|
srcIn = srcOut;
|
||||||
|
destIn = destOut;
|
||||||
|
}
|
||||||
|
|
||||||
|
MI->eraseFromParent(); // The instruction is gone now.
|
||||||
|
return BB;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -41,6 +41,9 @@ namespace llvm {
|
||||||
// PIC mode.
|
// PIC mode.
|
||||||
WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable
|
WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable
|
||||||
|
|
||||||
|
// Add pseudo op to model memcpy for struct byval.
|
||||||
|
COPY_STRUCT_BYVAL,
|
||||||
|
|
||||||
CALL, // Function call.
|
CALL, // Function call.
|
||||||
CALL_PRED, // Function call that's predicable.
|
CALL_PRED, // Function call that's predicable.
|
||||||
CALL_NOLINK, // Function call with branch not branch-and-link.
|
CALL_NOLINK, // Function call with branch not branch-and-link.
|
||||||
|
|
|
@ -18,6 +18,9 @@
|
||||||
// Type profiles.
|
// Type profiles.
|
||||||
def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
|
def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
|
||||||
def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
|
def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
|
||||||
|
def SDT_ARMStructByVal : SDTypeProfile<0, 3,
|
||||||
|
[SDTCisVT<0, i32>, SDTCisVT<1, i32>,
|
||||||
|
SDTCisVT<2, i32>]>;
|
||||||
|
|
||||||
def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
|
def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
|
||||||
|
|
||||||
|
@ -90,6 +93,10 @@ def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
|
||||||
[SDNPHasChain, SDNPOutGlue]>;
|
[SDNPHasChain, SDNPOutGlue]>;
|
||||||
def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd,
|
def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd,
|
||||||
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
|
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
|
||||||
|
def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
|
||||||
|
SDT_ARMStructByVal,
|
||||||
|
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
|
||||||
|
SDNPMayStore, SDNPMayLoad]>;
|
||||||
|
|
||||||
def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall,
|
def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall,
|
||||||
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
|
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
|
||||||
|
@ -4165,6 +4172,13 @@ let usesCustomInserter = 1 in {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let usesCustomInserter = 1 in {
|
||||||
|
def COPY_STRUCT_BYVAL_I32 : PseudoInst<
|
||||||
|
(outs), (ins GPR:$dst, GPR:$src, i32imm:$size),
|
||||||
|
NoItinerary,
|
||||||
|
[(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size)]>;
|
||||||
|
}
|
||||||
|
|
||||||
let mayLoad = 1 in {
|
let mayLoad = 1 in {
|
||||||
def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
|
def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
|
||||||
NoItinerary,
|
NoItinerary,
|
||||||
|
|
Loading…
Reference in New Issue