diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 0c892bb97f92..899165449f1e 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -951,39 +951,90 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
           (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
 }
 
-int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
-  const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
-  assert(MI && "MemcpyInst expected");
-  ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
-
-  // To model the cost of a library call, we assume 1 for the call, and
-  // 3 for the argument setup.
-  const unsigned LibCallCost = 4;
-
-  // If 'size' is not a constant, a library call will be generated.
-  if (!C)
-    return LibCallCost;
-
-  const unsigned Size = C->getValue().getZExtValue();
-  const Align DstAlign = *MI->getDestAlign();
-  const Align SrcAlign = *MI->getSourceAlign();
+/// Given a memcpy/memmove/memset intrinsic \p I, return the number of memory
+/// operations performed when it is expanded inline, via querying
+/// findOptimalMemOpLowering: a load and a store per entry for a copy or move,
+/// and a single store per entry for a set. Returns -1 if a call is used, i.e.
+/// the length is not a constant or no lowering could be found.
+int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
+  MemOp MOp;
+  unsigned DstAddrSpace = ~0u;
+  unsigned SrcAddrSpace = ~0u;
   const Function *F = I->getParent()->getParent();
-  const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
-  std::vector<EVT> MemOps;
+
+  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
+    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
+    // If 'size' is not a constant, a library call will be generated.
+    if (!C)
+      return -1;
+
+    const unsigned Size = C->getValue().getZExtValue();
+    // getDestAlign()/getSourceAlign() return a MaybeAlign, which may be
+    // unset; treat a missing alignment as 1 rather than dereferencing an
+    // empty value.
+    const Align DstAlign = MC->getDestAlign().valueOrOne();
+    const Align SrcAlign = MC->getSourceAlign().valueOrOne();
+
+    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
+                      /*IsVolatile*/ false);
+    DstAddrSpace = MC->getDestAddressSpace();
+    SrcAddrSpace = MC->getSourceAddressSpace();
+  }
+  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
+    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
+    // If 'size' is not a constant, a library call will be generated.
+    if (!C)
+      return -1;
+
+    const unsigned Size = C->getValue().getZExtValue();
+    const Align DstAlign = MS->getDestAlign().valueOrOne();
+
+    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
+                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
+    DstAddrSpace = MS->getDestAddressSpace();
+  }
+  else
+    llvm_unreachable("Expected a memcpy/move or memset!");
+
+  unsigned Limit, Factor = 2;
+  switch(I->getIntrinsicID()) {
+  case Intrinsic::memcpy:
+    Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
+    break;
+  case Intrinsic::memmove:
+    Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
+    break;
+  case Intrinsic::memset:
+    Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
+    Factor = 1;
+    break;
+  default:
+    llvm_unreachable("Expected a memcpy/move or memset!");
+  }
 
-  // MemOps will be poplulated with a list of data types that needs to be
-  // loaded and stored. That's why we multiply the number of elements by 2 to
-  // get the cost for this memcpy.
+  // MemOps will be populated with a list of data types that need to be
+  // accessed. A copy or move loads and stores each of them, so Factor is 2;
+  // a set only stores them, so Factor is 1.
+  std::vector<EVT> MemOps;
   if (getTLI()->findOptimalMemOpLowering(
-          MemOps, Limit,
-          MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
-                      /*IsVolatile*/ true),
-          MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
-          F->getAttributes()))
-    return MemOps.size() * 2;
+          MemOps, Limit, MOp, DstAddrSpace,
+          SrcAddrSpace, F->getAttributes()))
+    return MemOps.size() * Factor;
 
-  // If we can't find an optimal memop lowering, return the default cost
-  return LibCallCost;
+  // If we can't find an optimal memop lowering, a call will be used.
+  return -1;
+}
+
+/// Return the cost of the memory intrinsic \p I: the number of memory
+/// operations it expands to, or a fixed library-call cost when a call is used.
+int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
+  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
+
+  // To model the cost of a library call, we assume 1 for the call, and
+  // 3 for the argument setup.
+  if (NumOps == -1)
+    return 4;
+  return NumOps;
 }
 
 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
@@ -1520,9 +1571,18 @@ bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
   // Check if an intrinsic will be lowered to a call and assume that any
   // other CallInst will generate a bl.
   if (auto *Call = dyn_cast<CallInst>(&I)) {
-    if (isa<IntrinsicInst>(Call)) {
-      if (const Function *F = Call->getCalledFunction())
-        return isLoweredToCall(F);
+    if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
+      switch(II->getIntrinsicID()) {
+        // Memory intrinsics are only treated as calls when getNumMemOps
+        // reports that a call is used for them.
+        case Intrinsic::memcpy:
+        case Intrinsic::memset:
+        case Intrinsic::memmove:
+          return getNumMemOps(II) == -1;
+        default:
+          if (const Function *F = Call->getCalledFunction())
+            return isLoweredToCall(F);
+      }
     }
     return true;
   }
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 5eddcf4ec802..3898272ed168 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -181,6 +181,8 @@ public:
 
   int getMemcpyCost(const Instruction *I);
 
+  int getNumMemOps(const IntrinsicInst *I) const;
+
   int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                      VectorType *SubTp);
 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
index 1ab0b606b4ac..8a4665a19a16 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -12,23 +12,20 @@ define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n,
 ; CHECK-NEXT:    blt .LBB0_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    mov r8, r3
-; CHECK-NEXT:    mov lr, r2
+; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    mov r9, r1
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    lsls r7, r3, #2
-; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    lsls r4, r3, #2
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:  .LBB0_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r0, r6, r4
-; CHECK-NEXT:    add.w r1, r9, r4
+; CHECK-NEXT:    adds r0, r7, r6
+; CHECK-NEXT:    add.w r1, r9, r6
 ; CHECK-NEXT:    mov r2, r8
-; CHECK-NEXT:    mov r5, lr
 ; CHECK-NEXT:    bl __aeabi_memcpy4
-; CHECK-NEXT:    mov lr, r5
-; CHECK-NEXT:    add r4, r7
-; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    add r6, r4
+; CHECK-NEXT:    subs r5, #1
 ; CHECK-NEXT:    bne .LBB0_2
-; CHECK-NEXT:    b .LBB0_3
 ; CHECK-NEXT:  .LBB0_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
@@ -64,20 +61,17 @@ define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) {
 ; CHECK-NEXT:    blt .LBB1_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    mov r4, r2
-; CHECK-NEXT:    mov lr, r1
-; CHECK-NEXT:    mov r5, r0
-; CHECK-NEXT:    lsls r6, r2, #2
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    lsls r7, r2, #2
 ; CHECK-NEXT:  .LBB1_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    mov r1, r4
-; CHECK-NEXT:    mov r7, lr
 ; CHECK-NEXT:    bl __aeabi_memclr4
-; CHECK-NEXT:    mov lr, r7
-; CHECK-NEXT:    add r5, r6
-; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    add r6, r7
+; CHECK-NEXT:    subs r5, #1
 ; CHECK-NEXT:    bne .LBB1_2
-; CHECK-NEXT:    b .LBB1_3
 ; CHECK-NEXT:  .LBB1_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
@@ -110,23 +104,20 @@ define void @test_memmove(i32* nocapture %x, i32* nocapture readonly %y, i32 %n,
 ; CHECK-NEXT:    blt .LBB2_3
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    mov r8, r3
-; CHECK-NEXT:    mov lr, r2
+; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    mov r9, r1
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    lsls r7, r3, #2
-; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    lsls r4, r3, #2
+; CHECK-NEXT:    movs r6, #0
 ; CHECK-NEXT:  .LBB2_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r0, r6, r4
-; CHECK-NEXT:    add.w r1, r9, r4
+; CHECK-NEXT:    adds r0, r7, r6
+; CHECK-NEXT:    add.w r1, r9, r6
 ; CHECK-NEXT:    mov r2, r8
-; CHECK-NEXT:    mov r5, lr
 ; CHECK-NEXT:    bl __aeabi_memmove4
-; CHECK-NEXT:    mov lr, r5
-; CHECK-NEXT:    add r4, r7
-; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    add r6, r4
+; CHECK-NEXT:    subs r5, #1
 ; CHECK-NEXT:    bne .LBB2_2
-; CHECK-NEXT:    b .LBB2_3
 ; CHECK-NEXT:  .LBB2_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}