[ARM] Transforming memcpy to Tail predicated Loop

This patch converts llvm.memcpy intrinsic into Tail Predicated
Hardware loops for a target that supports the Arm M-profile
Vector Extension (MVE).

From an implementation point of view, the patch

- adds an ARM specific SDAG Node (to which the llvm.memcpy intrinsic is lowered to, during first phase of ISel)
- adds a corresponding TableGen entry to generate a pseudo instruction, with a custom inserter,
  on matching the above node.
- Adds a custom inserter function that expands the pseudo instruction into MIR suitable
   to be transformed (by later passes) into a WLSTP loop.

Note: A command-line option is used to control the conversion of memcpy to a TP
loop, and this option is currently disabled by default. It may be enabled
in the future after further downstream testing.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D99723
This commit is contained in:
Malhar Jajoo 2021-05-06 01:38:20 +01:00
parent abe2c906ad
commit b856f4a232
9 changed files with 730 additions and 24 deletions

View File

@ -1802,6 +1802,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(ARMISD::CSINV)
MAKE_CASE(ARMISD::CSNEG)
MAKE_CASE(ARMISD::CSINC)
MAKE_CASE(ARMISD::MEMCPYLOOP)
#undef MAKE_CASE
}
return nullptr;
@ -11097,6 +11098,141 @@ static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
return true;
}
/// Adds logic in the loop-entry MBB to compute the total loop iteration count
/// and emits t2WhileLoopSetup and t2WhileLoopStart, from which later passes
/// generate the WLS(TP) loop.
///
/// \param TpEntry    block receiving the iteration-count computation.
/// \param TpLoopBody loop body block (successor taken when count > 0).
/// \param TpExit     exit block (branch target of the while-loop start when
///                   the iteration count is zero).
/// \param OpSizeReg  vreg holding the copy size in bytes (n).
/// \returns the vreg holding the total iteration count.
static Register genTPEntry(MachineBasicBlock *TpEntry,
                           MachineBasicBlock *TpLoopBody,
                           MachineBasicBlock *TpExit, Register OpSizeReg,
                           const TargetInstrInfo *TII, DebugLoc Dl,
                           MachineRegisterInfo &MRI) {
  // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
  //
  // NOTE(review): the previous sequence inserted "BIC #16" between the ADD
  // and the LSR. t2BICri computes Rd = Rn & ~Imm, so BIC #16 clears bit 4 of
  // (n + 15) — not the low four bits — which miscomputes the count (e.g.
  // n = 16 gave 0 iterations instead of 1, skipping the copy). No masking is
  // needed at all: the LSR #4 already discards the low four bits.
  Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
      .addUse(OpSizeReg)
      .addImm(15)
      .add(predOps(ARMCC::AL)) // not predicated
      .addReg(0);              // optional cc_out: $noreg, flags not set

  Register LsrDestReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
      .addUse(AddDestReg, RegState::Kill)
      .addImm(4)
      .add(predOps(ARMCC::AL))
      .addReg(0);

  // t2WhileLoopSetup/t2WhileLoopStart are pseudos; later passes turn them
  // into the actual WLS/WLSTP instruction, branching to TpExit when the
  // iteration count is zero.
  Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
      .addUse(LsrDestReg, RegState::Kill);

  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
      .addUse(TotalIterationsReg)
      .addMBB(TpExit);

  return TotalIterationsReg;
}
/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2LoopDec and
/// t2LoopEnd. These pseudos are used by later passes to generate tail
/// predicated loops. Each iteration loads and stores 16 bytes (VLDRB/VSTRB
/// with post-increment), predicated through the VCTP-produced mask so the
/// final iteration only touches the remaining tail bytes.
static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
                          MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
                          const TargetInstrInfo *TII, DebugLoc Dl,
                          MachineRegisterInfo &MRI, Register OpSrcReg,
                          Register OpDestReg, Register ElementCountReg,
                          Register TotalIterationsReg) {
  // First insert 4 PHI nodes for: the current src pointer, the current dest
  // pointer, the loop iteration counter, and the predication (remaining
  // element) counter. Note the "Curr*/Remaining*" vregs are used in the PHIs
  // before the instructions defining them are inserted below — standard SSA
  // construction for a loop.

  // Current position in the src array
  Register SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
      .addUse(OpSrcReg)
      .addMBB(TpEntry)
      .addUse(CurrSrcReg)
      .addMBB(TpLoopBody);

  // Current position in the dest array
  Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
      .addUse(OpDestReg)
      .addMBB(TpEntry)
      .addUse(CurrDestReg)
      .addMBB(TpLoopBody);

  // Current loop counter
  Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  Register RemainingLoopIterationsReg =
      MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
      .addUse(TotalIterationsReg)
      .addMBB(TpEntry)
      .addUse(RemainingLoopIterationsReg)
      .addMBB(TpLoopBody);

  // Predication counter (elements still to be copied)
  Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
      .addUse(ElementCountReg)
      .addMBB(TpEntry)
      .addUse(RemainingElementsReg)
      .addMBB(TpLoopBody);

  // Pass predication counter to VCTP: produces a lane mask that is all-true
  // until fewer than 16 elements remain.
  Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
      .addUse(PredCounterPhiReg)
      .addImm(ARMVCC::None) // the VCTP itself is unpredicated
      .addReg(0);

  // Decrement the remaining-elements counter by the 16 bytes consumed this
  // iteration (may go negative on the last iteration; only VCTP consumes it).
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
      .addUse(PredCounterPhiReg)
      .addImm(16)
      .add(predOps(ARMCC::AL))
      .addReg(0); // optional cc_out: $noreg, flags not set

  // VLDRB and VSTRB instructions, predicated using VPR; both post-increment
  // their base pointer by 16, defining the next iteration's Curr* values.
  Register LoadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
      .addDef(CurrSrcReg)
      .addDef(LoadedValueReg)
      .addReg(SrcPhiReg)
      .addImm(16)
      .addImm(ARMVCC::Then) // predicated on the VCTP mask
      .addUse(VccrReg);

  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
      .addDef(CurrDestReg)
      .addUse(LoadedValueReg, RegState::Kill)
      .addReg(DestPhiReg)
      .addImm(16)
      .addImm(ARMVCC::Then)
      .addUse(VccrReg);

  // Add the pseudoInstrs for decrementing the loop counter and marking the
  // end: t2LoopDec and t2LoopEnd (lowered to LE/LETP or a sub+branch by the
  // low-overhead-loops pass).
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
      .addUse(LoopCounterPhiReg)
      .addImm(1);

  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
      .addUse(RemainingLoopIterationsReg)
      .addMBB(TpLoopBody);

  // Unconditional fall-through branch to the exit block.
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
      .addMBB(TpExit)
      .add(predOps(ARMCC::AL));
}
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
@ -11123,6 +11259,91 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}
case ARM::MVE_MEMCPYLOOPINST: {

  // Transformation below expands MVE_MEMCPYLOOPINST Pseudo instruction
  // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
  // the iteration count (= ceil(size_in_bytes/16)) in the TP entry block and
  // adds the relevant instructions in the TP loop Body for generation of a
  // WLSTP loop.

  // Below is relevant portion of the CFG after the transformation.
  // The Machine Basic Blocks are shown along with branch conditions (in
  // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
  // portion of the CFG and may not necessarily be the entry/exit of the
  // function.

  //             (Relevant) CFG after transformation:
  //             TP entry MBB
  //                 |
  //          |-----------------|
  //       (n <= 0)          (n > 0)
  //          |                 |
  //          |         TP loop Body MBB<--|
  //          |                 |          |
  //           \                |___________|
  //            \              /
  //              TP exit MBB

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Pseudo operands: (dst, src, size-in-bytes), all in rGPR vregs.
  Register OpDestReg = MI.getOperand(0).getReg();
  Register OpSrcReg = MI.getOperand(1).getReg();
  Register OpSizeReg = MI.getOperand(2).getReg();

  // Allocate the required MBBs and add to parent function.
  MachineBasicBlock *TpEntry = BB;
  MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
  MachineBasicBlock *TpExit;

  MF->push_back(TpLoopBody);

  // If any instructions are present in the current block after
  // MVE_MEMCPYLOOPINST, split the current block and move the instructions
  // into the newly created exit block. If there are no instructions
  // add an explicit branch to the FallThrough block and then split.
  //
  // The split is required for two reasons:
  // 1) A terminator(t2WhileLoopStart) will be placed at that site.
  // 2) Since a TPLoopBody will be added later, any phis in successive blocks
  //    need to be updated. splitAt() already handles this.
  TpExit = BB->splitAt(MI, false);
  if (TpExit == BB) {
    // splitAt() returns the same block when MI is the last instruction:
    // insert a branch to the fall-through successor so a second split
    // produces a real (non-empty terminator) exit block.
    assert(BB->canFallThrough() &&
           "Exit block must be FallThrough of the block containing memcpy");
    TpExit = BB->getFallThrough();
    BuildMI(BB, dl, TII->get(ARM::t2B))
        .addMBB(TpExit)
        .add(predOps(ARMCC::AL));
    TpExit = BB->splitAt(MI, false);
  }

  // Add logic for iteration count
  Register TotalIterationsReg =
      genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);

  // Add the vectorized (and predicated) loads/store instructions
  genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
                OpDestReg, OpSizeReg, TotalIterationsReg);

  // Connect the blocks (TpLoopBody is its own successor: the back edge).
  TpEntry->addSuccessor(TpLoopBody);
  TpLoopBody->addSuccessor(TpLoopBody);
  TpLoopBody->addSuccessor(TpExit);

  // Reorder for a more natural layout
  TpLoopBody->moveAfter(TpEntry);
  TpExit->moveAfter(TpLoopBody);

  // Finally, remove the memcpy Pseudo Instruction
  MI.eraseFromParent();

  // Return the exit block as it may contain other instructions requiring a
  // custom inserter
  return TpExit;
}
// The Thumb2 pre-indexed stores have the same MI operands, they just
// define them differently in the .td files from the isel patterns, so
// they need pseudos.

View File

@ -300,6 +300,10 @@ class VectorType;
// instructions.
MEMCPY,
// Pseudo-instruction representing a memory copy using a tail predicated
// loop
MEMCPYLOOP,
// V8.1MMainline condition select
CSINV, // Conditional select invert.
CSNEG, // Conditional select negate.

View File

@ -6865,6 +6865,18 @@ class MVE_WLSTP<string asm, bits<2> size>
let isTerminator = 1;
}
// Type profile for ARMISD::MEMCPYLOOP: no results, three operands
// (dst pointer, src pointer, i32 size-in-bytes).
def SDT_MVEMEMCPYLOOPNODE
    : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
// SDAG node produced when llvm.memcpy is lowered to a tail-predicated loop;
// chained, and both reads and writes memory.
def MVE_MEMCPYLOOPNODE : SDNode<"ARMISD::MEMCPYLOOP", SDT_MVEMEMCPYLOOPNODE,
                                [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;

// Pseudo matched from the node above; expanded by the custom inserter
// (EmitInstrWithCustomInserter) into the MIR of a WLSTP memcpy loop.
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
  def MVE_MEMCPYLOOPINST : PseudoInst<(outs),
        (ins rGPR:$dst, rGPR:$src, rGPR:$sz),
        NoItinerary,
        [(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>;
}
def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>;
def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;

View File

@ -11,12 +11,27 @@
//===----------------------------------------------------------------------===//
#include "ARMTargetMachine.h"
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "arm-selectiondag-info"
// Controls conversion of memcpy into a tail-predicated (WLSTP) loop.
// force-disabled (default): never convert; force-enabled: always convert;
// allow: convert subject to profitability checks (see GenInlineTP in
// EmitTargetCodeForMemcpy: not optnone/optsize, alignment or size limits).
cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));
// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
@ -130,13 +145,40 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
auto GenInlineTP = [&](const ARMSubtarget &Subtarget,
const SelectionDAG &DAG) {
auto &F = DAG.getMachineFunction().getFunction();
if (!EnableMemtransferTPLoop)
return false;
if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
return true;
// Do not generate inline TP loop if optimizations is disabled,
// or if optimization for size (-Os or -Oz) is on.
if (F.hasOptNone() || F.hasOptSize())
return false;
// If cli option is unset
if (!ConstantSize && Alignment >= Align(4))
return true;
if (ConstantSize &&
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
ConstantSize->getZExtValue() <
Subtarget.getMaxTPLoopInlineSizeThreshold())
return true;
return false;
};
if (Subtarget.hasMVEIntegerOps() && GenInlineTP(Subtarget, DAG))
return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
DAG.getZExtOrTrunc(Size, dl, MVT::i32));
// Do repeated 4-byte loads and stores. To be improved.
// This requires 4-byte alignment.
if (Alignment < Align(4))
return SDValue();
// This requires the copy size to be a constant, preferably
// within a subtarget-specific limit.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (!ConstantSize)
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
Alignment.value(), RTLIB::MEMCPY);

View File

@ -538,6 +538,11 @@ public:
return 64;
}
/// getMaxTPLoopInlineSizeThreshold - Returns the maximum memcpy size (in
/// bytes) that still makes it profitable to inline the call as a Tail
/// Predicated loop (WLSTP).
unsigned getMaxTPLoopInlineSizeThreshold() const { return 128; }
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

View File

@ -48,6 +48,11 @@ namespace TailPredication {
};
}
// For controlling conversion of memcpy into a Tail Predicated loop.
// Plain (non-class) enum is deliberate: callers rely on implicit conversion
// to bool/int (e.g. `if (!EnableMemtransferTPLoop)`).
namespace TPLoop {
// ForceDisabled: never convert; ForceEnabled: always convert;
// Allow: convert when the target-specific profitability checks pass.
enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow };
}
class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
using BaseT = BasicTTIImplBase<ARMTTIImpl>;
using TTI = TargetTransformInfo;

View File

@ -1,34 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) {
; CHECK-LABEL: test_memcpy:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: blt .LBB0_3
; CHECK-NEXT: blt .LBB0_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r8, r3
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: mov r9, r1
; CHECK-NEXT: mov r7, r0
; CHECK-NEXT: lsls r4, r3, #2
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: lsl.w r12, r3, #2
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_2: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r0, r7, r6
; CHECK-NEXT: add.w r1, r9, r6
; CHECK-NEXT: mov r2, r8
; CHECK-NEXT: bl __aeabi_memcpy4
; CHECK-NEXT: add r6, r4
; CHECK-NEXT: subs r5, #1
; CHECK-NEXT: bne .LBB0_2
; CHECK-NEXT: .LBB0_3: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_4 Depth 2
; CHECK-NEXT: adds r4, r1, r7
; CHECK-NEXT: adds r5, r0, r7
; CHECK-NEXT: mov r6, r3
; CHECK-NEXT: wlstp.8 lr, r6, .LBB0_3
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .LBB0_3: @ %for.body
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: add r7, r12
; CHECK-NEXT: subs r2, #1
; CHECK-NEXT: beq .LBB0_5
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_4: @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrb.u8 q0, [r4], #16
; CHECK-NEXT: vstrb.8 q0, [r5], #16
; CHECK-NEXT: letp lr, .LBB0_4
; CHECK-NEXT: b .LBB0_3
; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup

View File

@ -0,0 +1,285 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s
; Check that WLSTP loop is not generated for alignment < 4
; void test1(char* dest, char* src, int n){
; memcpy(dest, src, n);
; }
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){
; CHECK-LABEL: test1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: bl __aeabi_memcpy
; CHECK-NEXT: pop {r7, pc}
entry:
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %X, i8* align 1 %Y, i32 %n, i1 false)
ret void
}
; Check that WLSTP loop is generated for alignment >= 4
; void test2(int* restrict X, int* restrict Y, int n){
; memcpy(X, Y, n);
; }
define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n){
; CHECK-LABEL: test2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: wlstp.8 lr, r2, .LBB1_2
; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
; CHECK-NEXT: vstrb.8 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB1_1
; CHECK-NEXT: .LBB1_2: @ %entry
; CHECK-NEXT: pop {r7, pc}
entry:
%0 = bitcast i32* %X to i8*
%1 = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
ret void
}
; Checks that transform handles some arithmetic on the input arguments.
; void test3(int* restrict X, int* restrict Y, int n)
; {
; memcpy(X+2, Y+3, (n*2)+10);
; }
define void @test3(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
; CHECK-LABEL: test3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: movs r3, #10
; CHECK-NEXT: add.w r2, r3, r2, lsl #1
; CHECK-NEXT: adds r1, #12
; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: wlstp.8 lr, r2, .LBB2_2
; CHECK-NEXT: .LBB2_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
; CHECK-NEXT: vstrb.8 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB2_1
; CHECK-NEXT: .LBB2_2: @ %entry
; CHECK-NEXT: pop {r7, pc}
entry:
%add.ptr = getelementptr inbounds i32, i32* %X, i32 2
%0 = bitcast i32* %add.ptr to i8*
%add.ptr1 = getelementptr inbounds i32, i32* %Y, i32 3
%1 = bitcast i32* %add.ptr1 to i8*
%mul = shl nsw i32 %n, 1
%add = add nsw i32 %mul, 10
call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull align 4 %0, i8* nonnull align 4 %1, i32 %add, i1 false)
ret void
}
; Checks that the transform handles for-loops that are implicitly converted to memcpy
; void test4(int* restrict X, int* restrict Y, int n){
; for(int i = 0; i < n; ++i){
; X[i] = Y[i];
; }
; }
define void @test4(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
; CHECK-LABEL: test4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB3_1: @ %for.body.preheader
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: wlstp.8 lr, r2, .LBB3_3
; CHECK-NEXT: .LBB3_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
; CHECK-NEXT: vstrb.8 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB3_2
; CHECK-NEXT: .LBB3_3: @ %for.body.preheader
; CHECK-NEXT: pop.w {r7, lr}
; CHECK-NEXT: bx lr
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%X.bits = bitcast i32* %X to i8*
%Y.bits = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body.preheader, %entry
ret void
}
; Checks that transform can handle > i32 size inputs
define void @test5(i8* noalias %X, i8* noalias %Y, i64 %n){
; CHECK-LABEL: test5:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: wlstp.8 lr, r2, .LBB4_2
; CHECK-NEXT: .LBB4_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
; CHECK-NEXT: vstrb.8 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB4_1
; CHECK-NEXT: .LBB4_2:
; CHECK-NEXT: pop {r7, pc}
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %X, i8* align 4 %Y, i64 %n, i1 false)
ret void
}
; Checks the transform is applied for constant size inputs below a certain threshold (128 in this case)
define void @test6(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
; CHECK-LABEL: test6:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: movs r2, #127
; CHECK-NEXT: wlstp.8 lr, r2, .LBB5_2
; CHECK-NEXT: .LBB5_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
; CHECK-NEXT: vstrb.8 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB5_1
; CHECK-NEXT: .LBB5_2: @ %entry
; CHECK-NEXT: pop {r7, pc}
entry:
%0 = bitcast i32* %X to i8*
%1 = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* noundef nonnull align 4 dereferenceable(127) %0, i8* noundef nonnull align 4 dereferenceable(127) %1, i32 127, i1 false)
ret void
}
; Checks the transform is NOT applied for constant size inputs above a certain threshold (128 in this case)
define void @test7(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
; CHECK-LABEL: test7:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: movs r2, #128
; CHECK-NEXT: bl __aeabi_memcpy4
; CHECK-NEXT: pop {r7, pc}
entry:
%0 = bitcast i32* %X to i8*
%1 = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 128, i1 false)
ret void
}
; Checks the transform is NOT applied for constant size inputs below a certain threshold (64 in this case)
define void @test8(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
; CHECK-LABEL: test8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: ldm.w r1!, {r2, r3, r4, r12, lr}
; CHECK-NEXT: stm.w r0!, {r2, r3, r4, r12, lr}
; CHECK-NEXT: ldm.w r1!, {r2, r3, r4, r12, lr}
; CHECK-NEXT: stm.w r0!, {r2, r3, r4, r12, lr}
; CHECK-NEXT: ldm.w r1, {r2, r3, r4, r12, lr}
; CHECK-NEXT: stm.w r0, {r2, r3, r4, r12, lr}
; CHECK-NEXT: pop {r4, pc}
entry:
%0 = bitcast i32* %X to i8*
%1 = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 60, i1 false)
ret void
}
; Checks the transform is NOT applied (regardless of alignment) when optimizations are disabled
define void @test9(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) #0 {
; CHECK-LABEL: test9:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: bl __aeabi_memcpy4
; CHECK-NEXT: pop {r7, pc}
entry:
%0 = bitcast i32* %X to i8*
%1 = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
ret void
}
; Checks the transform is NOT applied (regardless of alignment) when optimization for size is on (-Os or -Oz)
define void @test10(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) #1 {
; CHECK-LABEL: test10:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: bl __aeabi_memcpy4
; CHECK-NEXT: pop {r7, pc}
entry:
%0 = bitcast i32* %X to i8*
%1 = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
ret void
}
define void @test11(i8* nocapture %x, i8* nocapture %y, i32 %n) {
; CHECK-LABEL: test11:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp.w r2, #-1
; CHECK-NEXT: it gt
; CHECK-NEXT: popgt {r4, pc}
; CHECK-NEXT: .LBB10_1: @ %prehead
; CHECK-NEXT: add.w r3, r2, #15
; CHECK-NEXT: mov r12, r1
; CHECK-NEXT: bic r3, r3, #16
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: lsr.w lr, r3, #4
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: subs.w lr, lr, #0
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: .LBB10_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.8 r3
; CHECK-NEXT: subs r3, #16
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrbt.u8 q0, [r12], #16
; CHECK-NEXT: vstrbt.8 q0, [r4], #16
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: b .LBB10_3
; CHECK-NEXT: .LBB10_3: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb r3, [r0], #1
; CHECK-NEXT: subs r2, #2
; CHECK-NEXT: strb r3, [r1], #1
; CHECK-NEXT: bne .LBB10_3
; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
%cmp6 = icmp slt i32 %n, 0
br i1 %cmp6, label %prehead, label %for.cond.cleanup
prehead: ; preds = %entry
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %x, i8* align 4 %y, i32 %n, i1 false)
br label %for.body
for.body: ; preds = %for.body, %prehead
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %prehead ]
%x.addr.08 = phi i8* [ %add.ptr, %for.body ], [ %x, %prehead ]
%y.addr.07 = phi i8* [ %add.ptr1, %for.body ], [ %y, %prehead ]
%add.ptr = getelementptr inbounds i8, i8* %x.addr.08, i32 1
%add.ptr1 = getelementptr inbounds i8, i8* %y.addr.07, i32 1
%l = load i8, i8* %x.addr.08, align 1
store i8 %l, i8* %y.addr.07, align 1
%inc = add nuw nsw i32 %i.09, 2
%exitcond.not = icmp eq i32 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
for.cond.cleanup: ; preds = %entry
ret void
}
attributes #0 = { noinline optnone }
attributes #1 = { optsize }

View File

@ -0,0 +1,127 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -simplify-mir -run-pass=finalize-isel %s -o - | FileCheck %s
--- |
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "arm-arm-none-eabi"
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
entry:
%0 = bitcast i32* %X to i8*
%1 = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
ret void
}
define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%X.bits = bitcast i32* %X to i8*
%Y.bits = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body.preheader, %entry
ret void
}
...
---
name: test1
tracksRegLiveness: true
body: |
bb.0.entry:
liveins: $r0, $r1, $r2
; CHECK-LABEL: name: test1
; CHECK: liveins: $r0, $r1, $r2
; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1
; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr
; CHECK: .1:
; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.0, %8, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %10, %bb.1
; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %12, %bb.1
; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %14, %bb.1
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr
; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
; CHECK: .2.entry:
; CHECK: tBX_RET 14 /* CC::al */, $noreg
%2:rgpr = COPY $r2
%1:rgpr = COPY $r1
%0:rgpr = COPY $r0
MVE_MEMCPYLOOPINST %0, %1, %2
tBX_RET 14 /* CC::al */, $noreg
...
---
name: test2
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: test2
; CHECK: bb.0.entry:
; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000)
; CHECK: liveins: $r0, $r1, $r2
; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1
; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
; CHECK: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg
; CHECK: bb.1.for.body.preheader:
; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.4, implicit-def $cpsr
; CHECK: bb.3:
; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.1, %8, %bb.3
; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %10, %bb.3
; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %12, %bb.3
; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %14, %bb.3
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
; CHECK: bb.4.for.body.preheader:
; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
; CHECK: bb.2.for.cond.cleanup:
; CHECK: tBX_RET 14 /* CC::al */, $noreg
bb.0.entry:
successors: %bb.1(0x50000000), %bb.2(0x30000000)
liveins: $r0, $r1, $r2
%2:rgpr = COPY $r2
%1:rgpr = COPY $r1
%0:rgpr = COPY $r0
t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
t2B %bb.1, 14 /* CC::al */, $noreg
bb.1.for.body.preheader:
successors: %bb.2(0x80000000)
MVE_MEMCPYLOOPINST %0, %1, %2
bb.2.for.cond.cleanup:
tBX_RET 14 /* CC::al */, $noreg
...