diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 173b57c62a2f..98e6b4ca6bd0 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -788,6 +788,49 @@ bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
                                          const ARMSubtarget *Subtarget,
                                          bool ForCodesize = false);
 
+// Return the immediate if this is ADDri or SUBri, scaled as appropriate.
+// Returns 0 for unknown instructions.
+inline int getAddSubImmediate(MachineInstr &MI) {
+  int Scale = 1;
+  unsigned ImmOp;
+  switch (MI.getOpcode()) {
+  case ARM::t2ADDri:
+    ImmOp = 2;
+    break;
+  case ARM::t2SUBri:
+  case ARM::t2SUBri12:
+    ImmOp = 2;
+    Scale = -1;
+    break;
+  case ARM::tSUBi3:
+  case ARM::tSUBi8:
+    ImmOp = 3;
+    Scale = -1;
+    break;
+  default:
+    return 0;
+  }
+  return Scale * MI.getOperand(ImmOp).getImm();
+}
+
+// Given a memory access Opcode, check that the given Imm would be a valid
+// Offset for this instruction using its addressing mode.
+inline bool isLegalAddressImm(unsigned Opcode, int Imm,
+                              const TargetInstrInfo *TII) {
+  const MCInstrDesc &Desc = TII->get(Opcode);
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i7:
+    return std::abs(Imm) < (((1 << 7) * 1) - 1);
+  case ARMII::AddrModeT2_i7s2:
+    return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0;
+  case ARMII::AddrModeT2_i7s4:
+    return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0;
+  default:
+    llvm_unreachable("Unhandled Addressing mode");
+  }
+}
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
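A note on the ranges above, since they are easy to misread: the sketch below is a minimal standalone model of the same arithmetic in plain C++. It is not the LLVM API, and the helper name legalT2i7 is invented for illustration. Scale is the access size in bytes (1, 2 or 4), and the "- 1" keeps the check conservative by one at the top of each range.

  #include <cstdio>
  #include <cstdlib>

  // Standalone model of isLegalAddressImm's bounds for the three MVE T2_i7
  // addressing modes under this patch: |Imm| must be below
  // ((1 << 7) * Scale) - 1 and Imm must be a multiple of Scale.
  static bool legalT2i7(int Imm, int Scale) {
    return std::abs(Imm) < ((1 << 7) * Scale) - 1 && Imm % Scale == 0;
  }

  int main() {
    printf("%d\n", legalT2i7(16, 4));   // 1: multiple of 4, well in range
    printf("%d\n", legalT2i7(-508, 4)); // 1: 508 < 511 and a multiple of 4
    printf("%d\n", legalT2i7(510, 4));  // 0: in range but not a multiple of 4
    printf("%d\n", legalT2i7(127, 1));  // 0: the exclusive bound rejects 127
    return 0;
  }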
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 852e3d7cbca3..d38da836eb8c 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -50,6 +51,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Allocator.h"
@@ -1383,6 +1385,38 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
   case ARM::t2STRi8:
   case ARM::t2STRi12:
     return ARM::t2STR_POST;
+
+  case ARM::MVE_VLDRBS16:
+    return ARM::MVE_VLDRBS16_post;
+  case ARM::MVE_VLDRBS32:
+    return ARM::MVE_VLDRBS32_post;
+  case ARM::MVE_VLDRBU16:
+    return ARM::MVE_VLDRBU16_post;
+  case ARM::MVE_VLDRBU32:
+    return ARM::MVE_VLDRBU32_post;
+  case ARM::MVE_VLDRHS32:
+    return ARM::MVE_VLDRHS32_post;
+  case ARM::MVE_VLDRHU32:
+    return ARM::MVE_VLDRHU32_post;
+  case ARM::MVE_VLDRBU8:
+    return ARM::MVE_VLDRBU8_post;
+  case ARM::MVE_VLDRHU16:
+    return ARM::MVE_VLDRHU16_post;
+  case ARM::MVE_VLDRWU32:
+    return ARM::MVE_VLDRWU32_post;
+  case ARM::MVE_VSTRB16:
+    return ARM::MVE_VSTRB16_post;
+  case ARM::MVE_VSTRB32:
+    return ARM::MVE_VSTRB32_post;
+  case ARM::MVE_VSTRH32:
+    return ARM::MVE_VSTRH32_post;
+  case ARM::MVE_VSTRBU8:
+    return ARM::MVE_VSTRBU8_post;
+  case ARM::MVE_VSTRHU16:
+    return ARM::MVE_VSTRHU16_post;
+  case ARM::MVE_VSTRWU32:
+    return ARM::MVE_VSTRWU32_post;
+
   default: llvm_unreachable("Unhandled opcode!");
   }
 }
@@ -2046,6 +2080,7 @@ namespace {
     const TargetRegisterInfo *TRI;
     const ARMSubtarget *STI;
     MachineRegisterInfo *MRI;
+    MachineDominatorTree *DT;
     MachineFunction *MF;
 
     ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
@@ -2058,6 +2093,8 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AAResultsWrapperPass>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -2071,14 +2108,19 @@
                           unsigned Base, bool isLd,
                           DenseMap<MachineInstr *, unsigned> &MI2LocMap);
     bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+    bool DistributeIncrements();
+    bool DistributeIncrements(Register Base);
   };
 
 } // end anonymous namespace
 
 char ARMPreAllocLoadStoreOpt::ID = 0;
 
-INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
-                ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+                      ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+                    ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
 
 // Limit the number of instructions to be rescheduled.
 // FIXME: tune this limit, and/or come up with some better heuristics.
@@ -2094,10 +2136,11 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   TII = STI->getInstrInfo();
   TRI = STI->getRegisterInfo();
   MRI = &Fn.getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
   MF = &Fn;
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
-  bool Modified = false;
+  bool Modified = DistributeIncrements();
   for (MachineBasicBlock &MFI : Fn)
     Modified |= RescheduleLoadStoreInstrs(&MFI);
@@ -2475,6 +2518,198 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
   return RetVal;
 }
 
+// Get the Base register operand index from the memory access MachineInstr if
+// we should attempt to distribute postinc on it. Returns -1 if it is not a
+// valid instruction type. If it returns an index, it is assumed that the
+// instruction uses an r+i addressing mode, and getBaseOperandIndex() + 1 is
+// the Offset index.
+static int getBaseOperandIndex(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case ARM::MVE_VLDRBS16:
+  case ARM::MVE_VLDRBS32:
+  case ARM::MVE_VLDRBU16:
+  case ARM::MVE_VLDRBU32:
+  case ARM::MVE_VLDRHS32:
+  case ARM::MVE_VLDRHU32:
+  case ARM::MVE_VLDRBU8:
+  case ARM::MVE_VLDRHU16:
+  case ARM::MVE_VLDRWU32:
+  case ARM::MVE_VSTRB16:
+  case ARM::MVE_VSTRB32:
+  case ARM::MVE_VSTRH32:
+  case ARM::MVE_VSTRBU8:
+  case ARM::MVE_VSTRHU16:
+  case ARM::MVE_VSTRWU32:
+    return 1;
+  }
+  return -1;
+}
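A note on the operand layout assumed here: for every MVE load/store listed, operand 0 is the Q register being transferred (the def for loads, the source for stores), operand 1 is the base pointer, and operand 2 is the immediate offset, which is why this always returns 1. In MIR terms, the rewrite this enables looks like the MVE_VLDRWU32 test updated further down (virtual register numbers are illustrative):

  %1:mqpr = MVE_VLDRWU32 %0, 0, 0, $noreg :: (load 16, align 8)
  %2:rgpr = nuw t2ADDri %0, 32, 14, $noreg, $noreg
becomes
  %2:rgpr, %1:mqpr = MVE_VLDRWU32_post %0, 32, 0, $noreg :: (load 16, align 8)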
+static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
+                                            Register NewReg,
+                                            const TargetInstrInfo *TII,
+                                            const TargetRegisterInfo *TRI) {
+  MachineFunction *MF = MI->getMF();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  unsigned NewOpcode = getPostIndexedLoadStoreOpcode(
+      MI->getOpcode(), Offset > 0 ? ARM_AM::add : ARM_AM::sub);
+
+  const MCInstrDesc &MCID = TII->get(NewOpcode);
+  // Constrain the def register class
+  const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
+  MRI.constrainRegClass(NewReg, TRC);
+  // And do the same for the base operand
+  TRC = TII->getRegClass(MCID, 2, TRI, *MF);
+  MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC);
+
+  return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
+      .addReg(NewReg, RegState::Define)
+      .add(MI->getOperand(0))
+      .add(MI->getOperand(1))
+      .addImm(Offset)
+      .add(MI->getOperand(3))
+      .add(MI->getOperand(4))
+      .cloneMemRefs(*MI);
+}
+
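Before the implementation below, a toy model of the offset arithmetic may help. This is plain C++, hypothetical and independent of the pass; it only reproduces the LDR example from the comment that follows: the zero-offset access absorbs the whole increment as a post-inc, and every later access is rebased by subtracting that increment.

  #include <cstdio>
  #include <vector>

  int main() {
    // Accesses at byte offsets 0, 4, 8, 12 from a base pointer, followed by
    // ADD #16 on that pointer (the left-hand column of the example below).
    std::vector<int> Offsets = {0, 4, 8, 12};
    int Increment = 16;

    // The zero-offset access becomes a post-inc by the full increment...
    printf("LDR_POSTINC #%d\n", Increment);
    // ...and each remaining access is rewritten against the new pointer value.
    for (int O : Offsets)
      if (O != 0)
        printf("LDR #%d\n", O - Increment); // prints -12, -8 and -4
    return 0;
  }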
+// Given a Base Register, optimise the load/store uses to attempt to create
+// more post-inc accesses. We do this by taking zero offset loads/stores with
+// an add, and converting them to a postinc load/store of the same type. Any
+// subsequent accesses will be adjusted to use and account for the post-inc
+// value.
+// For example:
+// LDR #0            LDR_POSTINC #16
+// LDR #4            LDR #-12
+// LDR #8            LDR #-8
+// LDR #12           LDR #-4
+// ADD #16
+bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
+  // We are looking for:
+  // One zero offset load/store that can become postinc
+  MachineInstr *BaseAccess = nullptr;
+  // An increment that can be folded in
+  MachineInstr *Increment = nullptr;
+  // Other accesses after BaseAccess that will need to be updated to use the
+  // postinc value
+  SmallPtrSet<MachineInstr *, 8> OtherAccesses;
+  for (auto &Use : MRI->use_nodbg_instructions(Base)) {
+    if (!Increment && getAddSubImmediate(Use) != 0) {
+      Increment = &Use;
+      continue;
+    }
+
+    int BaseOp = getBaseOperandIndex(Use);
+    if (BaseOp == -1)
+      return false;
+
+    if (!Use.getOperand(BaseOp).isReg() ||
+        Use.getOperand(BaseOp).getReg() != Base)
+      return false;
+    if (Use.getOperand(BaseOp + 1).getImm() == 0)
+      BaseAccess = &Use;
+    else
+      OtherAccesses.insert(&Use);
+  }
+
+  if (!BaseAccess || !Increment ||
+      BaseAccess->getParent() != Increment->getParent())
+    return false;
+  Register PredReg;
+  if (Increment->definesRegister(ARM::CPSR) ||
+      getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
+                    << Base.virtRegIndex() << "\n");
+
+  // Make sure that Increment has no uses before BaseAccess.
+  for (MachineInstr &Use :
+       MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
+    if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
+      LLVM_DEBUG(dbgs() << "  BaseAccess doesn't dominate use of increment\n");
+      return false;
+    }
+  }
+
+  // Make sure that Increment can be folded into Base
+  int IncrementOffset = getAddSubImmediate(*Increment);
+  unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
+      BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
+  if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
+    LLVM_DEBUG(dbgs() << "  Illegal addressing mode immediate on postinc\n");
+    return false;
+  }
+
+  // And make sure that the negative value of the increment can be added to all
+  // other offsets after the BaseAccess. We rely on either
+  // dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess)
+  // to keep things simple.
+  SmallPtrSet<MachineInstr *, 4> SuccessorAccesses;
+  for (auto *Use : OtherAccesses) {
+    if (DT->dominates(BaseAccess, Use)) {
+      SuccessorAccesses.insert(Use);
+      unsigned BaseOp = getBaseOperandIndex(*Use);
+      if (!isLegalAddressImm(
+              Use->getOpcode(),
+              Use->getOperand(BaseOp + 1).getImm() - IncrementOffset, TII)) {
+        LLVM_DEBUG(dbgs() << "  Illegal addressing mode immediate on use\n");
+        return false;
+      }
+    } else if (!DT->dominates(Use, BaseAccess)) {
+      LLVM_DEBUG(
+          dbgs() << "  Unknown dominance relation between Base and Use\n");
+      return false;
+    }
+  }
+
+  // Replace BaseAccess with a post inc
+  LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
+  LLVM_DEBUG(dbgs() << "  And   : "; Increment->dump());
+  Register NewBaseReg = Increment->getOperand(0).getReg();
+  MachineInstr *BaseAccessPost =
+      createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
+  BaseAccess->eraseFromParent();
+  Increment->eraseFromParent();
+  LLVM_DEBUG(dbgs() << "  To    : "; BaseAccessPost->dump());
+
+  for (auto *Use : SuccessorAccesses) {
+    LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
+    unsigned BaseOp = getBaseOperandIndex(*Use);
+    Use->getOperand(BaseOp).setReg(NewBaseReg);
+    int OldOffset = Use->getOperand(BaseOp + 1).getImm();
+    Use->getOperand(BaseOp + 1).setImm(OldOffset - IncrementOffset);
+    LLVM_DEBUG(dbgs() << "  To    : "; Use->dump());
+  }
+
+  // Remove the kill flag from all uses of NewBaseReg, in case any old uses
+  // remain.
+  for (MachineOperand &Op : MRI->use_nodbg_operands(NewBaseReg))
+    Op.setIsKill(false);
+  return true;
+}
+
+bool ARMPreAllocLoadStoreOpt::DistributeIncrements() {
+  bool Changed = false;
+  SmallSetVector<Register, 4> Visited;
+  for (auto &MBB : *MF) {
+    for (auto &MI : MBB) {
+      int BaseOp = getBaseOperandIndex(MI);
+      if (BaseOp == -1 || !MI.getOperand(BaseOp).isReg())
+        continue;
+
+      Register Base = MI.getOperand(BaseOp).getReg();
+      if (!Base.isVirtual() || Visited.count(Base))
+        continue;
+
+      Visited.insert(Base);
+    }
+  }
+
+  for (auto Base : Visited)
+    Changed |= DistributeIncrements(Base);
+
+  return Changed;
+}
+
 /// Returns an instance of the load / store optimization pass.
 FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
   if (PreAlloc)
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 4ccf62759f5a..44ddb4e2f801 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -463,21 +463,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
   // that the predication will be equivalent. For this we need:
   // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
   // and we can also allow register copies within the chain too.
- auto IsValidSub = [](MachineInstr *MI, unsigned ExpectedVecWidth) { - unsigned ImmOpIdx = 0; - switch (MI->getOpcode()) { - default: - llvm_unreachable("unhandled sub opcode"); - case ARM::tSUBi3: - case ARM::tSUBi8: - ImmOpIdx = 3; - break; - case ARM::t2SUBri: - case ARM::t2SUBri12: - ImmOpIdx = 2; - break; - } - return MI->getOperand(ImmOpIdx).getImm() == ExpectedVecWidth; + auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) { + return -getAddSubImmediate(*MI) == ExpectedVecWidth; }; MBB = VCTP->getParent(); diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index f37b0863f8d7..60e9932e95f6 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -94,6 +94,7 @@ ; CHECK-NEXT: Remove dead machine instructions ; CHECK-NEXT: MVE VPT Optimisation Pass ; CHECK-NEXT: ARM MLA / MLS expansion pass +; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: ARM pre- register allocation load / store optimization pass ; CHECK-NEXT: ARM A15 S->D optimizer ; CHECK-NEXT: Detect Dead Lanes diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index f6d1d9ab8839..e86d3606f27f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1057,8 +1057,8 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: cmp r3, #8 ; CHECK-NEXT: blo.w .LBB16_12 ; CHECK-NEXT: @ %bb.1: @ %if.then @@ -1066,130 +1066,124 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: cmp.w r7, r3, lsr #2 ; CHECK-NEXT: beq.w .LBB16_12 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph -; CHECK-NEXT: ldrh r4, [r0] -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: ldrd r6, r12, [r0, #4] -; CHECK-NEXT: lsrs r3, r3, #2 -; CHECK-NEXT: sub.w r0, r4, #8 +; CHECK-NEXT: ldrh r5, [r0] +; CHECK-NEXT: lsr.w r9, r3, #2 +; CHECK-NEXT: ldrd r8, r12, [r0, #4] +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r0, r5, #8 ; CHECK-NEXT: add.w r7, r0, r0, lsr #29 ; CHECK-NEXT: and r0, r0, #7 -; CHECK-NEXT: asr.w lr, r7, #3 -; CHECK-NEXT: cmp.w lr, #1 +; CHECK-NEXT: asrs r6, r7, #3 +; CHECK-NEXT: cmp r6, #1 ; CHECK-NEXT: it gt -; CHECK-NEXT: asrgt r5, r7, #3 -; CHECK-NEXT: add.w r7, r6, r4, lsl #2 +; CHECK-NEXT: asrgt r3, r7, #3 +; CHECK-NEXT: add.w r7, r8, r5, lsl #2 ; CHECK-NEXT: sub.w r11, r7, #4 -; CHECK-NEXT: rsbs r7, r4, #0 -; CHECK-NEXT: str r7, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: add.w r7, r12, #32 -; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: rsbs r3, r5, #0 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r3, r12, #32 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_3: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload 
-; CHECK-NEXT: ldrd r11, r3, [sp, #28] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: subs.w r9, r9, #1 +; CHECK-NEXT: ldrd r11, r1, [sp, #24] @ 8-byte Folded Reload ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: subs r3, #1 ; CHECK-NEXT: add.w r0, r8, r0, lsl #2 -; CHECK-NEXT: add.w r6, r0, #16 +; CHECK-NEXT: add.w r8, r0, #16 ; CHECK-NEXT: beq .LBB16_12 ; CHECK-NEXT: .LBB16_4: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 -; CHECK-NEXT: add.w lr, r12, #12 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: ldm.w r12, {r0, r5, r7} -; CHECK-NEXT: ldm.w lr, {r4, r9, lr} -; CHECK-NEXT: ldrd r8, r10, [r12, #24] +; CHECK-NEXT: ldrd r0, r7, [r12] +; CHECK-NEXT: ldrd r4, r6, [r12, #8] +; CHECK-NEXT: ldrd r5, r3, [r12, #16] +; CHECK-NEXT: ldrd lr, r10, [r12, #24] ; CHECK-NEXT: vstrb.8 q0, [r11], #16 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: vldrw.u32 q1, [r6, #4] -; CHECK-NEXT: vldrw.u32 q6, [r6, #8] -; CHECK-NEXT: vldrw.u32 q4, [r6, #12] +; CHECK-NEXT: vldrw.u32 q0, [r8], #32 +; CHECK-NEXT: vldrw.u32 q1, [r8, #-28] ; CHECK-NEXT: vmul.f32 q0, q0, r0 -; CHECK-NEXT: vldrw.u32 q5, [r6, #16] -; CHECK-NEXT: vfma.f32 q0, q1, r5 -; CHECK-NEXT: vldrw.u32 q2, [r6, #20] -; CHECK-NEXT: vfma.f32 q0, q6, r7 -; CHECK-NEXT: vldrw.u32 q3, [r6, #24] -; CHECK-NEXT: vfma.f32 q0, q4, r4 -; CHECK-NEXT: vldrw.u32 q1, [r6, #28] -; CHECK-NEXT: vfma.f32 q0, q5, r9 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vfma.f32 q0, q2, lr -; CHECK-NEXT: add.w r5, r6, #32 -; CHECK-NEXT: vfma.f32 q0, q3, r8 -; CHECK-NEXT: cmp r0, #16 +; CHECK-NEXT: vldrw.u32 q6, [r8, #-24] +; CHECK-NEXT: vldrw.u32 q4, [r8, #-20] +; CHECK-NEXT: vfma.f32 q0, q1, r7 +; CHECK-NEXT: vldrw.u32 q5, [r8, #-16] +; CHECK-NEXT: vfma.f32 q0, q6, r4 +; CHECK-NEXT: vldrw.u32 q2, [r8, #-12] +; CHECK-NEXT: vfma.f32 q0, q4, r6 +; CHECK-NEXT: vldrw.u32 q3, [r8, #-8] +; CHECK-NEXT: vfma.f32 q0, q5, r5 +; CHECK-NEXT: vldrw.u32 q1, [r8, #-4] +; CHECK-NEXT: vfma.f32 q0, q2, r3 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vfma.f32 q0, q3, lr +; CHECK-NEXT: strd r11, r1, [sp, #24] @ 8-byte Folded Spill ; CHECK-NEXT: vfma.f32 q0, q1, r10 -; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill -; CHECK-NEXT: strd r11, r3, [sp, #28] @ 8-byte Folded Spill +; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: blo .LBB16_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr.w lr, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr.w lr, [sp] @ 4-byte Reload ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: .LBB16_6: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldm.w r6, {r0, r3, r4, r7, r10, r11} -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vldrw.u32 q6, [r5, #8] -; CHECK-NEXT: vldrw.u32 q4, [r5, #12] -; CHECK-NEXT: vldrw.u32 q5, [r5, #16] -; CHECK-NEXT: vldrw.u32 q2, [r5, #20] +; CHECK-NEXT: ldm.w r7, {r0, r3, r4, r5, r6, r10, r11} +; CHECK-NEXT: vldrw.u32 q1, [r8], #32 +; CHECK-NEXT: vldrw.u32 q6, [r8, #-24] +; CHECK-NEXT: vldrw.u32 q4, [r8, #-20] ; CHECK-NEXT: vfma.f32 q0, q1, r0 -; CHECK-NEXT: vldrw.u32 q1, [r5, #4] -; CHECK-NEXT: ldrd r1, r9, [r6, #24] -; CHECK-NEXT: vldrw.u32 q3, [r5, #24] +; CHECK-NEXT: vldrw.u32 q1, [r8, #-28] +; CHECK-NEXT: 
vldrw.u32 q5, [r8, #-16] +; CHECK-NEXT: vldrw.u32 q2, [r8, #-12] ; CHECK-NEXT: vfma.f32 q0, q1, r3 -; CHECK-NEXT: vldrw.u32 q1, [r5, #28] +; CHECK-NEXT: vldrw.u32 q3, [r8, #-8] ; CHECK-NEXT: vfma.f32 q0, q6, r4 -; CHECK-NEXT: add.w r8, r5, #32 -; CHECK-NEXT: vfma.f32 q0, q4, r7 -; CHECK-NEXT: adds r6, #32 -; CHECK-NEXT: vfma.f32 q0, q5, r10 -; CHECK-NEXT: vfma.f32 q0, q2, r11 -; CHECK-NEXT: mov r5, r8 -; CHECK-NEXT: vfma.f32 q0, q3, r1 -; CHECK-NEXT: vfma.f32 q0, q1, r9 +; CHECK-NEXT: ldr r1, [r7, #28] +; CHECK-NEXT: vfma.f32 q0, q4, r5 +; CHECK-NEXT: vldrw.u32 q1, [r8, #-4] +; CHECK-NEXT: vfma.f32 q0, q5, r6 +; CHECK-NEXT: adds r7, #32 +; CHECK-NEXT: vfma.f32 q0, q2, r10 +; CHECK-NEXT: vfma.f32 q0, q3, r11 +; CHECK-NEXT: vfma.f32 q0, q1, r1 ; CHECK-NEXT: le lr, .LBB16_6 ; CHECK-NEXT: b .LBB16_8 ; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: .LBB16_8: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB16_3 ; CHECK-NEXT: @ %bb.9: @ %while.body76.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r4, r8 ; CHECK-NEXT: .LBB16_10: @ %while.body76 ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr r1, [r6], #4 -; CHECK-NEXT: vldrw.u32 q1, [r0], #4 -; CHECK-NEXT: subs r5, #1 +; CHECK-NEXT: ldr r1, [r7], #4 +; CHECK-NEXT: vldrw.u32 q1, [r4], #4 +; CHECK-NEXT: subs r0, #1 ; CHECK-NEXT: vfma.f32 q0, q1, r1 -; CHECK-NEXT: cmp r5, #1 +; CHECK-NEXT: cmp r0, #1 ; CHECK-NEXT: bgt .LBB16_10 ; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: add.w r8, r8, r0, lsl #2 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_12: @ %if.end -; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-ldst-postinc.ll b/llvm/test/CodeGen/Thumb2/mve-ldst-postinc.ll index d84724a0b52e..3d0634c0ca85 100644 --- a/llvm/test/CodeGen/Thumb2/mve-ldst-postinc.ll +++ b/llvm/test/CodeGen/Thumb2/mve-ldst-postinc.ll @@ -992,8 +992,7 @@ define i8* @ldrwi32_align1(i8* %x, i8* %y) { ; ; CHECK-BE-LABEL: ldrwi32_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vldrb.u8 q0, [r0] -; CHECK-BE-NEXT: adds r0, #3 +; CHECK-BE-NEXT: vldrb.u8 q0, [r0], #3 ; CHECK-BE-NEXT: vrev32.8 q0, q0 ; CHECK-BE-NEXT: vstrw.32 q0, [r1] ; CHECK-BE-NEXT: bx lr @@ -1015,8 +1014,7 @@ define i8* @ldrhi16_align1(i8* %x, i8* %y) { ; ; CHECK-BE-LABEL: ldrhi16_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vldrb.u8 q0, [r0] -; CHECK-BE-NEXT: adds r0, #3 +; CHECK-BE-NEXT: vldrb.u8 q0, [r0], #3 ; CHECK-BE-NEXT: vrev16.8 q0, q0 ; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: bx lr @@ -1062,8 +1060,7 @@ define i8* @ldrf32_align1(i8* %x, i8* %y) { ; ; CHECK-BE-LABEL: ldrf32_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vldrb.u8 q0, [r0] -; CHECK-BE-NEXT: adds r0, #3 +; CHECK-BE-NEXT: vldrb.u8 q0, [r0], #3 ; CHECK-BE-NEXT: vrev32.8 q0, 
q0 ; CHECK-BE-NEXT: vstrw.32 q0, [r1] ; CHECK-BE-NEXT: bx lr @@ -1085,8 +1082,7 @@ define i8* @ldrf16_align1(i8* %x, i8* %y) { ; ; CHECK-BE-LABEL: ldrf16_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vldrb.u8 q0, [r0] -; CHECK-BE-NEXT: adds r0, #3 +; CHECK-BE-NEXT: vldrb.u8 q0, [r0], #3 ; CHECK-BE-NEXT: vrev16.8 q0, q0 ; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: bx lr @@ -1819,8 +1815,7 @@ define i8* @strwi32_align1(i8* %y, i8* %x) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldrw.u32 q0, [r1] ; CHECK-BE-NEXT: vrev32.8 q0, q0 -; CHECK-BE-NEXT: vstrb.8 q0, [r0] -; CHECK-BE-NEXT: adds r0, #3 +; CHECK-BE-NEXT: vstrb.8 q0, [r0], #3 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1842,8 +1837,7 @@ define i8* @strhi16_align1(i8* %y, i8* %x) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldrh.u16 q0, [r1] ; CHECK-BE-NEXT: vrev16.8 q0, q0 -; CHECK-BE-NEXT: vstrb.8 q0, [r0] -; CHECK-BE-NEXT: adds r0, #3 +; CHECK-BE-NEXT: vstrb.8 q0, [r0], #3 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1889,8 +1883,7 @@ define i8* @strf32_align1(i8* %y, i8* %x) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldrw.u32 q0, [r1] ; CHECK-BE-NEXT: vrev32.8 q0, q0 -; CHECK-BE-NEXT: vstrb.8 q0, [r0] -; CHECK-BE-NEXT: adds r0, #3 +; CHECK-BE-NEXT: vstrb.8 q0, [r0], #3 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 @@ -1912,8 +1905,7 @@ define i8* @strf16_align1(i8* %y, i8* %x) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldrh.u16 q0, [r1] ; CHECK-BE-NEXT: vrev16.8 q0, q0 -; CHECK-BE-NEXT: vstrb.8 q0, [r0] -; CHECK-BE-NEXT: adds r0, #3 +; CHECK-BE-NEXT: vstrb.8 q0, [r0], #3 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 3 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll index ba588166522b..5be3f722a0e4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll @@ -17,10 +17,9 @@ define i32 @vaddv(i32* nocapture readonly %data, i32 %N) { ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r1], #32 ; CHECK-NEXT: vaddva.s32 r0, q0 -; CHECK-NEXT: vldrw.u32 q0, [r1, #16] -; CHECK-NEXT: adds r1, #32 +; CHECK-NEXT: vldrw.u32 q0, [r1, #-16] ; CHECK-NEXT: vaddva.s32 r0, q0 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup @@ -68,40 +67,36 @@ define void @arm_cmplx_dot_prod_q15(i16* nocapture readonly %pSrcA, i16* nocaptu ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: lsr.w lr, r7, #3 ; CHECK-NEXT: mov r7, r12 -; CHECK-NEXT: mov r9, r12 +; CHECK-NEXT: mov r11, r12 ; CHECK-NEXT: wls lr, lr, .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: add.w r8, r0, lr, lsl #5 -; CHECK-NEXT: add.w r6, r0, #32 -; CHECK-NEXT: add.w r0, r1, #32 -; CHECK-NEXT: lsl.w r5, lr, #4 -; CHECK-NEXT: mov r4, r9 +; CHECK-NEXT: adds r0, #32 +; CHECK-NEXT: add.w r6, r1, #32 +; CHECK-NEXT: lsl.w r9, lr, #4 +; CHECK-NEXT: mov r4, r11 ; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: mov r12, r9 +; CHECK-NEXT: mov r12, r11 ; CHECK-NEXT: .LBB1_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vldrh.u16 q3, [r6, #-16] -; CHECK-NEXT: add.w r10, r6, #32 -; CHECK-NEXT: add.w r11, r0, #32 -; CHECK-NEXT: vmlaldavax.s16 
r4, r9, q0, q1 +; CHECK-NEXT: vldrh.u16 q2, [r6, #-16] +; CHECK-NEXT: vldrh.u16 q3, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r4, r11, q0, q1 ; CHECK-NEXT: vmlsldava.s16 r12, r7, q0, q1 -; CHECK-NEXT: vldrh.u16 q0, [r6] -; CHECK-NEXT: vldrh.u16 q1, [r0] -; CHECK-NEXT: vmlaldavax.s16 r4, r9, q3, q2 +; CHECK-NEXT: vldrh.u16 q0, [r0], #32 +; CHECK-NEXT: vldrh.u16 q1, [r6], #32 +; CHECK-NEXT: vmlaldavax.s16 r4, r11, q3, q2 ; CHECK-NEXT: vmlsldava.s16 r12, r7, q3, q2 -; CHECK-NEXT: mov r6, r10 -; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %while.cond.while.end_crit_edge -; CHECK-NEXT: add.w r1, r1, r5, lsl #1 +; CHECK-NEXT: add.w r1, r1, r9, lsl #1 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: .LBB1_4: @ %while.end -; CHECK-NEXT: vmlaldavax.s16 r4, r9, q0, q1 +; CHECK-NEXT: vmlaldavax.s16 r4, r11, q0, q1 ; CHECK-NEXT: vmlsldava.s16 r12, r7, q0, q1 ; CHECK-NEXT: mov r10, r4 -; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: mov r5, r11 ; CHECK-NEXT: lsrl r10, r5, #6 ; CHECK-NEXT: ldr.w r8, [sp, #36] ; CHECK-NEXT: mov r6, r12 @@ -113,20 +108,20 @@ define void @arm_cmplx_dot_prod_q15(i16* nocapture readonly %pSrcA, i16* nocaptu ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh.w r5, [r0, #2] ; CHECK-NEXT: ldrsh.w r6, [r1] -; CHECK-NEXT: ldrsh.w r10, [r0] +; CHECK-NEXT: ldrsh.w r9, [r0] ; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: ldrsh.w r2, [r1, #2] ; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: smlalbb r4, r9, r6, r5 -; CHECK-NEXT: smlalbb r12, r7, r6, r10 +; CHECK-NEXT: smlalbb r4, r11, r6, r5 +; CHECK-NEXT: smlalbb r12, r7, r6, r9 ; CHECK-NEXT: muls r5, r2, r5 -; CHECK-NEXT: smlalbb r4, r9, r2, r10 +; CHECK-NEXT: smlalbb r4, r11, r2, r9 ; CHECK-NEXT: subs.w r12, r12, r5 ; CHECK-NEXT: sbc.w r7, r7, r5, asr #31 ; CHECK-NEXT: le lr, .LBB1_5 ; CHECK-NEXT: @ %bb.6: @ %while.end34.loopexit ; CHECK-NEXT: lsrl r12, r7, #6 -; CHECK-NEXT: lsrl r4, r9, #6 +; CHECK-NEXT: lsrl r4, r11, #6 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: mov r10, r4 ; CHECK-NEXT: .LBB1_7: @ %while.end34 @@ -301,14 +296,11 @@ define void @fma8(float* noalias nocapture readonly %A, float* noalias nocapture ; CHECK-NEXT: vldrw.u32 q2, [r6, #16] ; CHECK-NEXT: vldrw.u32 q3, [r6] ; CHECK-NEXT: vfma.f32 q2, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vstrw.32 q2, [r6, #16] -; CHECK-NEXT: adds r4, #32 -; CHECK-NEXT: adds r5, #32 +; CHECK-NEXT: vldrw.u32 q0, [r4], #32 +; CHECK-NEXT: vldrw.u32 q1, [r5], #32 ; CHECK-NEXT: vfma.f32 q3, q1, q0 -; CHECK-NEXT: vstrw.32 q3, [r6] -; CHECK-NEXT: adds r6, #32 +; CHECK-NEXT: vstrw.32 q3, [r6], #32 +; CHECK-NEXT: vstrw.32 q2, [r6, #-16] ; CHECK-NEXT: le lr, .LBB2_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.mir b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.mir index 3b157c4112a2..d69037a952eb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.mir +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.mir @@ -51,9 +51,8 @@ body: | ; CHECK-LABEL: name: MVE_VLDRWU32 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRWU32_post:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_post1:%[0-9]+]]:mqpr = MVE_VLDRWU32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY 
[[MVE_VLDRWU32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 %1:mqpr = MVE_VLDRWU32 %0, 0, 0, $noreg :: (load 16, align 8) @@ -79,9 +78,8 @@ body: | ; CHECK-LABEL: name: MVE_VLDRHU16 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: [[MVE_VLDRHU16_:%[0-9]+]]:mqpr = MVE_VLDRHU16 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRHU16_post:%[0-9]+]]:rgpr, [[MVE_VLDRHU16_post1:%[0-9]+]]:mqpr = MVE_VLDRHU16_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRHU16_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 %1:mqpr = MVE_VLDRHU16 %0, 0, 0, $noreg :: (load 16, align 8) @@ -107,9 +105,8 @@ body: | ; CHECK-LABEL: name: MVE_VLDRBU8 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: [[MVE_VLDRBU8_:%[0-9]+]]:mqpr = MVE_VLDRBU8 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBU8_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 %1:mqpr = MVE_VLDRBU8 %0, 0, 0, $noreg :: (load 16, align 8) @@ -135,9 +132,8 @@ body: | ; CHECK-LABEL: name: MVE_VLDRBS32 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 - ; CHECK: [[MVE_VLDRBS32_:%[0-9]+]]:mqpr = MVE_VLDRBS32 [[COPY]], 0, 0, $noreg :: (load 4, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRBS32_post:%[0-9]+]]:tgpr, [[MVE_VLDRBS32_post1:%[0-9]+]]:mqpr = MVE_VLDRBS32_post [[COPY]], 32, 0, $noreg :: (load 4, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBS32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:tgpr = COPY $r0 %1:mqpr = MVE_VLDRBS32 %0, 0, 0, $noreg :: (load 4, align 8) @@ -163,9 +159,8 @@ body: | ; CHECK-LABEL: name: MVE_VLDRBU32 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 - ; CHECK: [[MVE_VLDRBU32_:%[0-9]+]]:mqpr = MVE_VLDRBU32 [[COPY]], 0, 0, $noreg :: (load 4, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRBU32_post:%[0-9]+]]:tgpr, [[MVE_VLDRBU32_post1:%[0-9]+]]:mqpr = MVE_VLDRBU32_post [[COPY]], 32, 0, $noreg :: (load 4, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBU32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:tgpr = COPY $r0 %1:mqpr = MVE_VLDRBU32 %0, 0, 0, $noreg :: (load 4, align 8) @@ -191,9 +186,8 @@ body: | ; CHECK-LABEL: name: MVE_VLDRHS32 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 - ; CHECK: [[MVE_VLDRHS32_:%[0-9]+]]:mqpr = MVE_VLDRHS32 [[COPY]], 0, 0, $noreg :: (load 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRHS32_post:%[0-9]+]]:tgpr, [[MVE_VLDRHS32_post1:%[0-9]+]]:mqpr = MVE_VLDRHS32_post [[COPY]], 32, 0, $noreg :: (load 8) + ; CHECK: $r0 = COPY [[MVE_VLDRHS32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:tgpr = COPY $r0 %1:mqpr = 
MVE_VLDRHS32 %0, 0, 0, $noreg :: (load 8, align 8) @@ -219,9 +213,8 @@ body: | ; CHECK-LABEL: name: MVE_VLDRHU32 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 - ; CHECK: [[MVE_VLDRHU32_:%[0-9]+]]:mqpr = MVE_VLDRHU32 [[COPY]], 0, 0, $noreg :: (load 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRHU32_post:%[0-9]+]]:tgpr, [[MVE_VLDRHU32_post1:%[0-9]+]]:mqpr = MVE_VLDRHU32_post [[COPY]], 32, 0, $noreg :: (load 8) + ; CHECK: $r0 = COPY [[MVE_VLDRHU32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:tgpr = COPY $r0 %1:mqpr = MVE_VLDRHU32 %0, 0, 0, $noreg :: (load 8, align 8) @@ -247,9 +240,8 @@ body: | ; CHECK-LABEL: name: MVE_VLDRBS16 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 - ; CHECK: [[MVE_VLDRBS16_:%[0-9]+]]:mqpr = MVE_VLDRBS16 [[COPY]], 0, 0, $noreg :: (load 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRBS16_post:%[0-9]+]]:tgpr, [[MVE_VLDRBS16_post1:%[0-9]+]]:mqpr = MVE_VLDRBS16_post [[COPY]], 32, 0, $noreg :: (load 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBS16_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:tgpr = COPY $r0 %1:mqpr = MVE_VLDRBS16 %0, 0, 0, $noreg :: (load 8, align 8) @@ -275,9 +267,8 @@ body: | ; CHECK-LABEL: name: MVE_VLDRBU16 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:tgpr = COPY $r0 - ; CHECK: [[MVE_VLDRBU16_:%[0-9]+]]:mqpr = MVE_VLDRBU16 [[COPY]], 0, 0, $noreg :: (load 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRBU16_post:%[0-9]+]]:tgpr, [[MVE_VLDRBU16_post1:%[0-9]+]]:mqpr = MVE_VLDRBU16_post [[COPY]], 32, 0, $noreg :: (load 8) + ; CHECK: $r0 = COPY [[MVE_VLDRBU16_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:tgpr = COPY $r0 %1:mqpr = MVE_VLDRBU16 %0, 0, 0, $noreg :: (load 8, align 8) @@ -303,10 +294,9 @@ body: | ; CHECK-LABEL: name: MVE_VSTRWU32 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: MVE_VSTRWU32 [[COPY]], [[COPY1]], 0, 0, $noreg :: (store 16, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY1]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VSTRWU32_post:%[0-9]+]]:rgpr = MVE_VSTRWU32_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRWU32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %1:mqpr = COPY $q0 %0:gprnopc = COPY $r0 @@ -333,10 +323,9 @@ body: | ; CHECK-LABEL: name: MVE_VSTRHU16 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: MVE_VSTRHU16 [[COPY]], [[COPY1]], 0, 0, $noreg :: (store 16, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY1]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VSTRHU16_post:%[0-9]+]]:rgpr = MVE_VSTRHU16_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRHU16_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %1:mqpr = COPY $q0 %0:gprnopc = COPY $r0 @@ -363,10 +352,9 @@ body: | ; CHECK-LABEL: name: MVE_VSTRBU8 ; CHECK: liveins: $r0, $q0 
; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: MVE_VSTRBU8 [[COPY]], [[COPY1]], 0, 0, $noreg :: (store 16, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY1]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRBU8_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %1:mqpr = COPY $q0 %0:gprnopc = COPY $r0 @@ -394,9 +382,8 @@ body: | ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 - ; CHECK: MVE_VSTRH32 [[COPY]], [[COPY1]], 0, 0, $noreg :: (store 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY1]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VSTRH32_post:%[0-9]+]]:tgpr = MVE_VSTRH32_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 8) + ; CHECK: $r0 = COPY [[MVE_VSTRH32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %1:mqpr = COPY $q0 %0:tgpr = COPY $r0 @@ -424,9 +411,8 @@ body: | ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 - ; CHECK: MVE_VSTRB32 [[COPY]], [[COPY1]], 0, 0, $noreg :: (store 4, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY1]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VSTRB32_post:%[0-9]+]]:tgpr = MVE_VSTRB32_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 4, align 8) + ; CHECK: $r0 = COPY [[MVE_VSTRB32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %1:mqpr = COPY $q0 %0:tgpr = COPY $r0 @@ -454,9 +440,8 @@ body: | ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:mqpr = COPY $q0 ; CHECK: [[COPY1:%[0-9]+]]:tgpr = COPY $r0 - ; CHECK: MVE_VSTRB16 [[COPY]], [[COPY1]], 0, 0, $noreg :: (store 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY1]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VSTRB16_post:%[0-9]+]]:tgpr = MVE_VSTRB16_post [[COPY]], [[COPY1]], 32, 0, $noreg :: (store 8) + ; CHECK: $r0 = COPY [[MVE_VSTRB16_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %1:mqpr = COPY $q0 %0:tgpr = COPY $r0 @@ -484,10 +469,9 @@ body: | ; CHECK-LABEL: name: ld0ld4 ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: [[MVE_VLDRWU32_1:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 4, 0, $noreg :: (load 16, align 8) - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRWU32_post:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_post1:%[0-9]+]]:mqpr = MVE_VLDRWU32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[MVE_VLDRWU32_post]], -28, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRWU32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 %1:mqpr = MVE_VLDRWU32 %0, 0, 0, $noreg :: (load 16, align 8) @@ -516,9 +500,8 @@ body: | ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 4, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri 
[[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: [[MVE_VLDRWU32_1:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRWU32_post:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_post1:%[0-9]+]]:mqpr = MVE_VLDRWU32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRWU32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 %1:mqpr = MVE_VLDRWU32 %0, 4, 0, $noreg :: (load 16, align 8) @@ -548,10 +531,9 @@ body: | ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg ; CHECK: [[MVE_VLDRWU32_1:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 4, 0, $noreg :: (load 16, align 8) - ; CHECK: [[MVE_VLDRWU32_2:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRWU32_post:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_post1:%[0-9]+]]:mqpr = MVE_VLDRWU32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRWU32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 %1:mqpr = MVE_VLDRWU32 %0, 0, 0, $noreg :: (load 16, align 8) @@ -582,10 +564,9 @@ body: | ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 4, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: [[MVE_VLDRWU32_1:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: [[MVE_VLDRWU32_2:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 4, 0, $noreg :: (load 16, align 8) - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRWU32_post:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_post1:%[0-9]+]]:mqpr = MVE_VLDRWU32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRWU32_1:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[MVE_VLDRWU32_post]], -28, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRWU32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 %1:mqpr = MVE_VLDRWU32 %0, 4, 0, $noreg :: (load 16, align 8) @@ -614,10 +595,9 @@ body: | ; CHECK-LABEL: name: addload ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: [[MVE_VLDRWU32_1:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 4, 0, $noreg :: (load 16, align 8) - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: [[MVE_VLDRWU32_post:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_post1:%[0-9]+]]:mqpr = MVE_VLDRWU32_post [[COPY]], 32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[MVE_VLDRWU32_post]], -28, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRWU32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 %2:rgpr = nuw t2ADDri %0, 32, 14, $noreg, $noreg @@ -645,10 +625,9 @@ body: | ; CHECK-LABEL: name: sub ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = nuw t2SUBri 
[[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: [[MVE_VLDRWU32_1:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 4, 0, $noreg :: (load 16, align 8) - ; CHECK: $r0 = COPY [[t2SUBri]] + ; CHECK: [[MVE_VLDRWU32_post:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_post1:%[0-9]+]]:mqpr = MVE_VLDRWU32_post [[COPY]], -32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[MVE_VLDRWU32_post]], 36, 0, $noreg :: (load 16, align 8) + ; CHECK: $r0 = COPY [[MVE_VLDRWU32_post]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 %1:mqpr = MVE_VLDRWU32 %0, 0, 0, $noreg :: (load 16, align 8) @@ -804,10 +783,9 @@ body: | ; CHECK-LABEL: name: addUseOK ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = nuw t2SUBri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: [[MVE_VLDRWU32_1:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 4, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2LSRri:%[0-9]+]]:rgpr = nuw t2LSRri [[t2SUBri]], 2, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[MVE_VLDRWU32_post:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_post1:%[0-9]+]]:mqpr = MVE_VLDRWU32_post [[COPY]], -32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[MVE_VLDRWU32_post]], 36, 0, $noreg :: (load 16, align 8) + ; CHECK: [[t2LSRri:%[0-9]+]]:rgpr = nuw t2LSRri [[MVE_VLDRWU32_post]], 2, 14 /* CC::al */, $noreg, $noreg ; CHECK: $r0 = COPY [[t2LSRri]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 @@ -872,10 +850,9 @@ body: | ; CHECK-LABEL: name: addUseKilled ; CHECK: liveins: $r0, $q0 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 - ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 0, 0, $noreg :: (load 16, align 8) - ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = nuw t2SUBri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg - ; CHECK: [[t2LSRri:%[0-9]+]]:rgpr = nuw t2LSRri killed [[t2SUBri]], 2, 14 /* CC::al */, $noreg, $noreg - ; CHECK: [[MVE_VLDRWU32_1:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[COPY]], 4, 0, $noreg :: (load 16, align 8) + ; CHECK: [[MVE_VLDRWU32_post:%[0-9]+]]:rgpr, [[MVE_VLDRWU32_post1:%[0-9]+]]:mqpr = MVE_VLDRWU32_post [[COPY]], -32, 0, $noreg :: (load 16, align 8) + ; CHECK: [[t2LSRri:%[0-9]+]]:rgpr = nuw t2LSRri [[MVE_VLDRWU32_post]], 2, 14 /* CC::al */, $noreg, $noreg + ; CHECK: [[MVE_VLDRWU32_:%[0-9]+]]:mqpr = MVE_VLDRWU32 [[MVE_VLDRWU32_post]], 36, 0, $noreg :: (load 16, align 8) ; CHECK: $r0 = COPY [[t2LSRri]] ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index 4ece7d5c0b35..d01ef0095a60 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -1278,20 +1278,17 @@ define arm_aapcs_vfpcc void @ssatmul_8_q15(i16* nocapture readonly %pSrcA, i16* ; CHECK-NEXT: vldrh.s32 q2, [r0, #8] ; CHECK-NEXT: vldrh.s32 q3, [r1, #8] ; CHECK-NEXT: vmul.i32 q2, q3, q2 -; CHECK-NEXT: vldrh.s32 q3, [r1] +; CHECK-NEXT: vldrh.s32 q3, [r1], #16 ; CHECK-NEXT: vshr.s32 q2, q2, #15 -; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vmax.s32 q2, q2, q0 ; CHECK-NEXT: vmin.s32 q2, q2, q1 ; CHECK-NEXT: vstrh.32 q2, [r2, #8] -; CHECK-NEXT: vldrh.s32 q2, [r0] -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrh.s32 q2, [r0], #16 ; CHECK-NEXT: vmul.i32 q2, q3, q2 ; CHECK-NEXT: vshr.s32 q2, q2, #15 ; CHECK-NEXT: 
vmax.s32 q2, q2, q0 ; CHECK-NEXT: vmin.s32 q2, q2, q1 -; CHECK-NEXT: vstrh.32 q2, [r2] -; CHECK-NEXT: adds r2, #16 +; CHECK-NEXT: vstrh.32 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB6_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 @@ -2142,18 +2139,15 @@ define arm_aapcs_vfpcc void @usatmul_8_q15(i16* nocapture readonly %pSrcA, i16* ; CHECK-NEXT: vldrh.u32 q1, [r0, #8] ; CHECK-NEXT: vldrh.u32 q2, [r1, #8] ; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vldrh.u32 q2, [r1] +; CHECK-NEXT: vldrh.u32 q2, [r1], #16 ; CHECK-NEXT: vshr.u32 q1, q1, #15 -; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vmin.u32 q1, q1, q0 ; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vldrh.u32 q1, [r0] -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrh.u32 q1, [r0], #16 ; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: vshr.u32 q1, q1, #15 ; CHECK-NEXT: vmin.u32 q1, q1, q0 -; CHECK-NEXT: vstrh.32 q1, [r2] -; CHECK-NEXT: adds r2, #16 +; CHECK-NEXT: vstrh.32 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB12_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 @@ -2564,20 +2558,17 @@ define arm_aapcs_vfpcc void @ssatmul_16_q7(i8* nocapture readonly %pSrcA, i8* no ; CHECK-NEXT: vldrb.s16 q2, [r0, #8] ; CHECK-NEXT: vldrb.s16 q3, [r1, #8] ; CHECK-NEXT: vmul.i16 q2, q3, q2 -; CHECK-NEXT: vldrb.s16 q3, [r1] +; CHECK-NEXT: vldrb.s16 q3, [r1], #16 ; CHECK-NEXT: vshr.s16 q2, q2, #7 -; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vmax.s16 q2, q2, q0 ; CHECK-NEXT: vmin.s16 q2, q2, q1 ; CHECK-NEXT: vstrb.16 q2, [r2, #8] -; CHECK-NEXT: vldrb.s16 q2, [r0] -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrb.s16 q2, [r0], #16 ; CHECK-NEXT: vmul.i16 q2, q3, q2 ; CHECK-NEXT: vshr.s16 q2, q2, #7 ; CHECK-NEXT: vmax.s16 q2, q2, q0 ; CHECK-NEXT: vmin.s16 q2, q2, q1 -; CHECK-NEXT: vstrb.16 q2, [r2] -; CHECK-NEXT: adds r2, #16 +; CHECK-NEXT: vstrb.16 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB15_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 @@ -3667,18 +3658,15 @@ define arm_aapcs_vfpcc void @usatmul_16_q7(i8* nocapture readonly %pSrcA, i8* no ; CHECK-NEXT: vldrb.u16 q1, [r0, #8] ; CHECK-NEXT: vldrb.u16 q2, [r1, #8] ; CHECK-NEXT: vmul.i16 q1, q2, q1 -; CHECK-NEXT: vldrb.u16 q2, [r1] +; CHECK-NEXT: vldrb.u16 q2, [r1], #16 ; CHECK-NEXT: vshr.u16 q1, q1, #7 -; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vmin.u16 q1, q1, q0 ; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vldrb.u16 q1, [r0] -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrb.u16 q1, [r0], #16 ; CHECK-NEXT: vmul.i16 q1, q2, q1 ; CHECK-NEXT: vshr.u16 q1, q1, #7 ; CHECK-NEXT: vmin.u16 q1, q1, q0 -; CHECK-NEXT: vstrb.16 q1, [r2] -; CHECK-NEXT: adds r2, #16 +; CHECK-NEXT: vstrb.16 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB21_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll index 5a283e46432e..11bf7b5900c7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll @@ -68,9 +68,8 @@ define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: adds r0, #32 +; CHECK-NEXT: vldrw.u32 q0, [r0], #32 ; CHECK-NEXT: vmov.f64 d2, d1 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s10 @@ -151,8 +150,7 @@ define <4 x double> *@vld2_v2f64(<4 x double> *%src, <2 x double> *%dst) { ; CHECK-LABEL: vld2_v2f64: ; CHECK: @ %bb.0: @ %entry 
; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: adds r0, #32 +; CHECK-NEXT: vldrw.u32 q1, [r0], #32 ; CHECK-NEXT: vadd.f64 d1, d0, d1 ; CHECK-NEXT: vadd.f64 d0, d2, d3 ; CHECK-NEXT: vstrw.32 q0, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll index 7261842cb4a5..7a9d20c5c952 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll @@ -108,12 +108,12 @@ define <8 x i64> *@vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q0, [r0], #64 +; CHECK-NEXT: vldrw.u32 q3, [r0, #-48] +; CHECK-NEXT: vldrw.u32 q5, [r0, #-16] +; CHECK-NEXT: vmov.f64 d2, d1 ; CHECK-NEXT: vmov.f64 d8, d7 -; CHECK-NEXT: adds r0, #64 ; CHECK-NEXT: vmov.f32 s17, s15 ; CHECK-NEXT: vmov.f32 s18, s22 ; CHECK-NEXT: vmov.f32 s14, s20 @@ -121,7 +121,6 @@ define <8 x i64> *@vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: vmov.f32 s15, s21 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov.f64 d2, d1 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vmov.f32 s2, s8 @@ -246,11 +245,10 @@ define <8 x double> *@vld4_v2f64(<8 x double> *%src, <2 x double> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vadd.f64 d0, d0, d1 ; CHECK-NEXT: vadd.f64 d1, d2, d3 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vldrw.u32 q2, [r0], #64 ; CHECK-NEXT: vadd.f64 d2, d2, d3 ; CHECK-NEXT: vadd.f64 d3, d4, d5 ; CHECK-NEXT: vadd.f64 d1, d1, d0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll index 3ebf113a5c9f..a1b47029e4aa 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -29,8 +29,7 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q6, [r0, #32] ; CHECK-NEXT: vldrh.u16 q4, [r0, #48] -; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vldrh.u16 q7, [r0, #16] +; CHECK-NEXT: vldrh.u16 q0, [r0], #64 ; CHECK-NEXT: vmov r3, s24 ; CHECK-NEXT: vmovx.f16 s12, s16 ; CHECK-NEXT: vmov.16 q1[4], r3 @@ -39,6 +38,7 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n ; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vldrh.u16 q7, [r0, #-48] ; CHECK-NEXT: vmov.16 q2[0], r2 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: vmov.16 q2[1], r3 @@ -48,7 +48,6 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n ; CHECK-NEXT: vmov.16 q1[7], r2 ; CHECK-NEXT: vmov r2, s30 ; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: adds r0, #64 ; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload @@ -221,15 +220,14 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n ; CHECK-NEXT: vmov.f32 s31, s11 ; CHECK-NEXT: vmov q2, q4 ; CHECK-NEXT: vmov.f32 s25, s21 -; CHECK-NEXT: vstrh.16 q7, [r1, #16] ; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s27, s23 ; CHECK-NEXT: vmov.f32 s3, s15 -; 
CHECK-NEXT: vstrh.16 q6, [r1] ; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vstrh.16 q0, [r1, #32] +; CHECK-NEXT: vmov.f32 s27, s23 ; CHECK-NEXT: vstrh.16 q2, [r1, #48] -; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: vstrh.16 q6, [r1], #64 +; CHECK-NEXT: vstrh.16 q7, [r1, #-48] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end ; CHECK-NEXT: add sp, #104 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll index e461deab5276..e6bbf4d7e775 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll @@ -146,12 +146,12 @@ define <4 x double> *@vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: add.w r0, r1, #32 ; CHECK-NEXT: vmov.f64 d4, d2 ; CHECK-NEXT: vmov.f64 d5, d0 ; CHECK-NEXT: vmov.f64 d0, d3 -; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vstrw.32 q2, [r1], #32 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll index 0378f84834b1..d8f674ac9c19 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll @@ -109,12 +109,10 @@ define <8 x i64> *@vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) { ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vmov.f64 d4, d8 -; CHECK-NEXT: add.w r0, r1, #64 ; CHECK-NEXT: vmov.f32 s9, s17 ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s1 ; CHECK-NEXT: vmov.f32 s0, s18 -; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: vmov.f32 s1, s19 ; CHECK-NEXT: vmov.f64 d8, d6 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] @@ -125,6 +123,8 @@ define <8 x i64> *@vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) { ; CHECK-NEXT: vstrw.32 q4, [r1, #16] ; CHECK-NEXT: vmov.f32 s5, s15 ; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vstrw.32 q2, [r1], #64 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -218,18 +218,18 @@ define <8 x double> *@vst4_v2f64(<2 x double> *%src, <8 x double> *%dst) { ; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f64 d4, d6 -; CHECK-NEXT: add.w r0, r1, #64 -; CHECK-NEXT: vmov.f64 d5, d0 +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vmov.f64 d2, d6 +; CHECK-NEXT: vmov.f64 d3, d0 ; CHECK-NEXT: vmov.f64 d0, d7 -; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: vmov.f64 d6, d8 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f64 d7, d2 -; CHECK-NEXT: vmov.f64 d2, d9 +; CHECK-NEXT: vmov.f64 d7, d4 +; CHECK-NEXT: vmov.f64 d4, d9 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vstrw.32 q1, [r1], #64 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: