[ARM] Treat memcpy/memset/memmove as call instructions for low overhead loops
If an instruction will be lowered to a call, there is no advantage to using a low overhead loop, as the LR register will need to be spilled and reloaded around the call and the low overhead loop will end up being reverted. This teaches our hardware loop lowering that these memory intrinsics will be calls under certain situations. Differential Revision: https://reviews.llvm.org/D90439
This commit is contained in:
parent 785080e3fa
commit e474499402
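For intuition before the diff: a memcpy/memset/memmove only stays cheap when its length is a compile-time constant small enough to expand into a bounded number of inline loads and stores; otherwise it becomes a bl to a library routine, which clobbers LR and forces the low overhead loop to be reverted. The following is a minimal, self-contained sketch of that rule (all names here are hypothetical illustrations; the real logic is ARMTTIImpl::getNumMemOps in the diff below).

// Hypothetical stand-alone model of the decision; the real logic lives in
// ARMTTIImpl::getNumMemOps below.
#include <optional>

struct MemIntrinsicShape {
  std::optional<unsigned> ConstLength; // nullopt when the length is only known at run time
  unsigned OpWidthInBytes;             // widest load/store the target would use
  unsigned MaxInlineOps;               // target limit before falling back to a library call
};

// Returns the number of inline memory operations, or -1 when the intrinsic
// will be lowered to a library call (and would clobber LR inside a loop).
int numMemOpsOrCall(const MemIntrinsicShape &S) {
  if (!S.ConstLength)
    return -1;
  unsigned Ops = (*S.ConstLength + S.OpWidthInBytes - 1) / S.OpWidthInBytes;
  return Ops <= S.MaxInlineOps ? static_cast<int>(Ops) : -1;
}

// A low overhead loop is only worthwhile if no instruction in its body turns
// into such a call.
bool bodyAllowsLowOverheadLoop(const MemIntrinsicShape &S) {
  return numMemOpsOrCall(S) != -1;
}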
@@ -951,39 +951,85 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
           (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
 }
 
-int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
-  const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
-  assert(MI && "MemcpyInst expected");
-  ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
-
-  // To model the cost of a library call, we assume 1 for the call, and
-  // 3 for the argument setup.
-  const unsigned LibCallCost = 4;
-
-  // If 'size' is not a constant, a library call will be generated.
-  if (!C)
-    return LibCallCost;
-
-  const unsigned Size = C->getValue().getZExtValue();
-  const Align DstAlign = *MI->getDestAlign();
-  const Align SrcAlign = *MI->getSourceAlign();
-  const Function *F = I->getParent()->getParent();
-  const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
-  std::vector<EVT> MemOps;
-
-  // MemOps will be poplulated with a list of data types that needs to be
-  // loaded and stored. That's why we multiply the number of elements by 2 to
-  // get the cost for this memcpy.
-  if (getTLI()->findOptimalMemOpLowering(
-          MemOps, Limit,
-          MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
-                      /*IsVolatile*/ true),
-          MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
-          F->getAttributes()))
-    return MemOps.size() * 2;
-
-  // If we can't find an optimal memop lowering, return the default cost
-  return LibCallCost;
-}
+/// Given a memcpy/memset/memmove instruction, return the number of memory
+/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
+/// call is used.
+int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
+  MemOp MOp;
+  unsigned DstAddrSpace = ~0u;
+  unsigned SrcAddrSpace = ~0u;
+  const Function *F = I->getParent()->getParent();
+
+  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
+    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
+
+    // If 'size' is not a constant, a library call will be generated.
+    if (!C)
+      return -1;
+
+    const unsigned Size = C->getValue().getZExtValue();
+    const Align DstAlign = *MC->getDestAlign();
+    const Align SrcAlign = *MC->getSourceAlign();
+
+    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
+                      /*IsVolatile*/ false);
+    DstAddrSpace = MC->getDestAddressSpace();
+    SrcAddrSpace = MC->getSourceAddressSpace();
+  }
+  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
+    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
+
+    // If 'size' is not a constant, a library call will be generated.
+    if (!C)
+      return -1;
+
+    const unsigned Size = C->getValue().getZExtValue();
+    const Align DstAlign = *MS->getDestAlign();
+
+    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
+                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
+    DstAddrSpace = MS->getDestAddressSpace();
+  }
+  else
+    llvm_unreachable("Expected a memcpy/move or memset!");
+
+  unsigned Limit, Factor = 2;
+  switch(I->getIntrinsicID()) {
+  case Intrinsic::memcpy:
+    Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
+    break;
+  case Intrinsic::memmove:
+    Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
+    break;
+  case Intrinsic::memset:
+    Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
+    Factor = 1;
+    break;
+  default:
+    llvm_unreachable("Expected a memcpy/move or memset!");
+  }
+
+  // MemOps will be poplulated with a list of data types that needs to be
+  // loaded and stored. That's why we multiply the number of elements by 2 to
+  // get the cost for this memcpy.
+  std::vector<EVT> MemOps;
+  if (getTLI()->findOptimalMemOpLowering(
+          MemOps, Limit, MOp, DstAddrSpace,
+          SrcAddrSpace, F->getAttributes()))
+    return MemOps.size() * Factor;
+
+  // If we can't find an optimal memop lowering, return the default cost
+  return -1;
+}
+
+int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
+  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
+
+  // To model the cost of a library call, we assume 1 for the call, and
+  // 3 for the argument setup.
+  if (NumOps == -1)
+    return 4;
+  return NumOps;
+}
 
 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
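A quick worked example of the Factor logic above (my own illustration, not part of the patch): a 16-byte copy whose operands are 4-byte aligned is typically expanded into four 32-bit memops, each of which is a load plus a store, so the modelled cost is 8; a memset of the same size only stores, so the cost is 4; a non-constant length falls back to the library-call cost of 4 (1 for the call, 3 for argument setup).

// Stand-alone sketch of that arithmetic, assuming findOptimalMemOpLowering
// picks 32-bit memops; names and signature are illustrative only.
#include <cassert>

int modelCost(unsigned SizeInBytes, unsigned OpWidthInBytes, bool IsMemset,
              bool LengthIsConstant) {
  if (!LengthIsConstant)
    return 4; // 1 for the call + 3 for argument setup, as in getMemcpyCost
  unsigned NumMemOps = SizeInBytes / OpWidthInBytes;
  return NumMemOps * (IsMemset ? 1 : 2); // memset only stores, so Factor = 1
}

int main() {
  assert(modelCost(16, 4, /*IsMemset=*/false, /*LengthIsConstant=*/true) == 8);
  assert(modelCost(16, 4, /*IsMemset=*/true, /*LengthIsConstant=*/true) == 4);
  assert(modelCost(16, 4, /*IsMemset=*/false, /*LengthIsConstant=*/false) == 4);
  return 0;
}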
@@ -1520,9 +1566,16 @@ bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
   // Check if an intrinsic will be lowered to a call and assume that any
   // other CallInst will generate a bl.
   if (auto *Call = dyn_cast<CallInst>(&I)) {
-    if (isa<IntrinsicInst>(Call)) {
-      if (const Function *F = Call->getCalledFunction())
-        return isLoweredToCall(F);
+    if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
+      switch(II->getIntrinsicID()) {
+      case Intrinsic::memcpy:
+      case Intrinsic::memset:
+      case Intrinsic::memmove:
+        return getNumMemOps(II) == -1;
+      default:
+        if (const Function *F = Call->getCalledFunction())
+          return isLoweredToCall(F);
+      }
     }
     return true;
   }
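Roughly how this predicate is consumed, as a simplified stand-alone sketch (not the actual HardwareLoopInfo plumbing): a loop only qualifies for a low overhead loop if no instruction in its body may be lowered to a call, and with this patch a memcpy/memset/memmove with a small constant length no longer counts as one.

// Illustrative model only; enum values and helpers are invented for the sketch.
#include <algorithm>
#include <vector>

enum class Inst { Arith, Load, Store, SmallConstantMemcpy, DynamicMemcpy, Call };

bool maybeLoweredToCall(Inst I) {
  // DynamicMemcpy stands for a memory intrinsic whose length is not a
  // compile-time constant, which ends up as a library call.
  return I == Inst::Call || I == Inst::DynamicMemcpy;
}

bool allowLowOverheadLoop(const std::vector<Inst> &Body) {
  return std::none_of(Body.begin(), Body.end(),
                      [](Inst I) { return maybeLoweredToCall(I); });
}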
@@ -181,6 +181,8 @@ public:
 
   int getMemcpyCost(const Instruction *I);
 
+  int getNumMemOps(const IntrinsicInst *I) const;
+
   int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                      VectorType *SubTp);
 
@@ -12,23 +12,20 @@ define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n,
 ; CHECK-NEXT:    blt .LBB0_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    mov r8, r3
-; CHECK-NEXT:    mov lr, r2
+; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    mov r9, r1
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    lsls r7, r3, #2
-; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    lsls r4, r3, #2
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:  .LBB0_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r0, r6, r4
-; CHECK-NEXT:    add.w r1, r9, r4
+; CHECK-NEXT:    adds r0, r7, r6
+; CHECK-NEXT:    add.w r1, r9, r6
 ; CHECK-NEXT:    mov r2, r8
-; CHECK-NEXT:    mov r5, lr
 ; CHECK-NEXT:    bl __aeabi_memcpy4
-; CHECK-NEXT:    mov lr, r5
-; CHECK-NEXT:    add r4, r7
-; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    add r6, r4
+; CHECK-NEXT:    subs r5, #1
 ; CHECK-NEXT:    bne .LBB0_2
-; CHECK-NEXT:    b .LBB0_3
 ; CHECK-NEXT:  .LBB0_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
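For reference, the loops these tests check look roughly like the following C++ (my paraphrase of the IR; the exact operand shapes in the test may differ). Because the length is only known at run time, every iteration contains a call to an __aeabi_* routine, so LR cannot be used as a hardware loop counter; the expected output now keeps the counter in r5 and drops the mov r5, lr / mov lr, r5 pair around the call. The memset and memmove tests below follow the same pattern.

#include <cstring>

// Rough C++ analogue of @test_memcpy (illustrative, not the actual IR): the
// copy length depends on 'm', which is only known at run time, so each
// iteration calls the memcpy library routine instead of expanding inline.
void test_memcpy_like(int *x, const int *y, int n, int m) {
  for (int i = 0; i < n; ++i)
    std::memcpy(&x[i * m], &y[i * m], m * sizeof(int));
}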
@@ -64,20 +61,17 @@ define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) {
 ; CHECK-NEXT:    blt .LBB1_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    mov r4, r2
-; CHECK-NEXT:    mov lr, r1
-; CHECK-NEXT:    mov r5, r0
-; CHECK-NEXT:    lsls r6, r2, #2
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    lsls r7, r2, #2
 ; CHECK-NEXT:  .LBB1_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    mov r1, r4
-; CHECK-NEXT:    mov r7, lr
 ; CHECK-NEXT:    bl __aeabi_memclr4
-; CHECK-NEXT:    mov lr, r7
-; CHECK-NEXT:    add r5, r6
-; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    add r6, r7
+; CHECK-NEXT:    subs r5, #1
 ; CHECK-NEXT:    bne .LBB1_2
-; CHECK-NEXT:    b .LBB1_3
 ; CHECK-NEXT:  .LBB1_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
@@ -110,23 +104,20 @@ define void @test_memmove(i32* nocapture %x, i32* nocapture readonly %y, i32 %n,
 ; CHECK-NEXT:    blt .LBB2_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    mov r8, r3
-; CHECK-NEXT:    mov lr, r2
+; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    mov r9, r1
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    lsls r7, r3, #2
-; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    lsls r4, r3, #2
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:  .LBB2_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r0, r6, r4
-; CHECK-NEXT:    add.w r1, r9, r4
+; CHECK-NEXT:    adds r0, r7, r6
+; CHECK-NEXT:    add.w r1, r9, r6
 ; CHECK-NEXT:    mov r2, r8
-; CHECK-NEXT:    mov r5, lr
 ; CHECK-NEXT:    bl __aeabi_memmove4
-; CHECK-NEXT:    mov lr, r5
-; CHECK-NEXT:    add r4, r7
-; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    add r6, r4
+; CHECK-NEXT:    subs r5, #1
 ; CHECK-NEXT:    bne .LBB2_2
-; CHECK-NEXT:    b .LBB2_3
 ; CHECK-NEXT:  .LBB2_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}