diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 755c2e5eb666..7acb70c5e7f5 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1298,6 +1298,12 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
          E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
       RemovePredicate(&*I);
 
+    // Check if the instruction defining VPR is a VCMP so that it can be
+    // combined with the VPST. This should be the divergent instruction.
+    MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->MI->getOpcode()) != 0
+                             ? Divergent->MI
+                             : nullptr;
+
     unsigned Size = 0;
     auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
     auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
@@ -1307,13 +1313,32 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
       ++Size;
       ++I;
     }
-    // Create a VPST (with a null mask for now, we'll recompute it later).
-    MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
-                                      InsertAt->getDebugLoc(),
-                                      TII->get(ARM::MVE_VPST));
-    MIB.addImm(0);
-    LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
-    LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+    MachineInstrBuilder MIB;
+    LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: "
+                      << *Block.getPredicateThen());
+    if (VCMP) {
+      // Combine the VPST and VCMP into a VPT.
+      MIB =
+          BuildMI(*InsertAt->getParent(), InsertAt, InsertAt->getDebugLoc(),
+                  TII->get(VCMPOpcodeToVPT(VCMP->getOpcode())));
+      MIB.addImm(ARMVCC::Then);
+      // Register one.
+      MIB.add(VCMP->getOperand(1));
+      // Register two.
+      MIB.add(VCMP->getOperand(2));
+      // The comparison code, e.g. ge, eq, lt.
+      MIB.add(VCMP->getOperand(3));
+      LLVM_DEBUG(dbgs()
+                 << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
+      LoLoop.ToRemove.insert(VCMP);
+    } else {
+      // Create a VPST (with a null mask for now, we'll recompute it
+      // later); the VCMP case was handled above by creating a VPT instead.
+      MIB = BuildMI(*InsertAt->getParent(), InsertAt,
+                    InsertAt->getDebugLoc(), TII->get(ARM::MVE_VPST));
+      MIB.addImm(0);
+      LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+    }
     LoLoop.ToRemove.insert(Block.getPredicateThen());
     LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
   }
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
new file mode 100644
index 000000000000..222c2f036ca8
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=force-enabled-no-reductions -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @vcmp_vpst_combination(<16 x i8>* %pSrc, i16 zeroext %blockSize, i8* nocapture %pResult, i32* nocapture %pIndex) {
+; CHECK-LABEL: vcmp_vpst_combination:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov.i8 q0, #0x7f
+; CHECK-NEXT:    dlstp.8 lr, r1
+; CHECK-NEXT:  .LBB0_1: @ %do.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vpt.s8 ge, q0, q1
+; CHECK-NEXT:    vmovt q0, q1
+; CHECK-NEXT:    letp lr, .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %do.end
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %conv = zext i16 %blockSize to i32
+  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 0, i32 1)
+  %1 = extractvalue { <16 x i8>, i32 } %0, 0
+  br label %do.body
+
+do.body:                                         ; preds = %do.body, %entry
+  %indexVec.0 = phi <16 x i8> [ %1, %entry ], [ %add, %do.body ]
+  %curExtremIdxVec.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %6, %do.body ]
+  %curExtremValVec.0 = phi <16 x i8> [ <i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127>, %entry ], [ %6, %do.body ]
+  %blkCnt.0 = phi i32 [ %conv, %entry ], [ %sub2, %do.body ]
+  %2 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %blkCnt.0)
+  %3 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %pSrc, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
+  %4 = icmp sle <16 x i8> %3, %curExtremValVec.0
+  %5 = and <16 x i1> %4, %2
+  %6 = tail call <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8> %3, <16 x i8> %3, <16 x i1> %5, <16 x i8> %curExtremValVec.0)
+  %add = add <16 x i8> %indexVec.0, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+  %sub2 = add nsw i32 %blkCnt.0, -16
+  %cmp = icmp sgt i32 %blkCnt.0, 16
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  ret <16 x i8> %6
+}
+
+declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
+
+declare <16 x i1> @llvm.arm.mve.vctp8(i32)
+
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
+
+declare <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>)
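
In short: when the divergent instruction that defines VPR at the head of a VPT block is a VCMP, the pass now folds that VCMP and the VPST it would otherwise emit into a single VPT instruction, saving one instruction in the loop body. A rough before/after sketch for the loop in the new test; the "after" sequence is exactly what the test's CHECK lines verify, while the "before" sequence is inferred from the replaced VPST-creation code and is illustrative rather than captured output:

    @ before: separate compare and block start
    vldrb.u8 q1, [r0]
    vcmp.s8  ge, q0, q1
    vpst
    vmovt    q0, q1

    @ after: compare folded into the VPT block
    vldrb.u8 q1, [r0]
    vpt.s8   ge, q0, q1
    vmovt    q0, q1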