[ARM][LowOverheadLoops] Merge a VCMP and the new VPST into a VPT

There were cases where a VCMP and a VPST were merged even though the
VCMP's operands did not have the same reaching definitions at the VPST
as at the VCMP, i.e. an operand was redefined between the two
instructions. This is fixed by adding RDA checks on those defs. That in
turn gave rise to cases where the newly created VPST would precede the
un-merged VCMP and so would trip a predicate mask assertion, since the
VCMP wasn't predicated. This is solved by converting the VCMP into a
VPT instead of inserting a new VPST.

Differential Revision: https://reviews.llvm.org/D90461
Author: Sam Tebbs
Date:   2020-10-30 13:30:58 +00:00
parent 6e8a8c2d7e
commit 40a3f7e48d
2 changed files with 101 additions and 20 deletions
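
For context, the reaching-definition requirement described in the commit
message amounts to: a VCMP may only be folded into a VPST (or turned into a
VPT at the VPST's position) if none of the VCMP's register operands is
redefined between the two instructions. Below is a minimal, hypothetical
sketch of such a check using ReachingDefAnalysis; the helper name and its
placement are illustrative only and are not part of this commit's code.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ReachingDefAnalysis.h"

using namespace llvm;

// Hypothetical helper (not the commit's actual code): the VCMP can only be
// combined with the VPST if every register operand of the VCMP sees the same
// reaching definition at both instructions, i.e. nothing clobbers an operand
// between the VCMP and the VPST.
static bool canCombineVCMPWithVPST(ReachingDefAnalysis &RDA,
                                   MachineInstr &VCMP, MachineInstr &VPST) {
  for (const MachineOperand &MO : VCMP.operands()) {
    if (!MO.isReg() || !MO.getReg() || !MO.isUse())
      continue;
    // Does this register have the same reaching def at the VCMP and the VPST?
    if (!RDA.hasSameReachingDef(&VCMP, &VPST, MO.getReg().asMCReg()))
      return false;
  }
  return true;
}

In the new vcmp_new_vpst_combination test below, the vmov of q2 sitting
between the vcmp and the vpst is this situation: q2 is an operand of the vcmp
and is redefined before the vpst, which is why the expected output keeps a
separate vcmp/vpst pair instead of a single vpt.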


@@ -1530,22 +1530,25 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
         // TODO: We could be producing more VPT blocks than necessary and could
         // fold the newly created one into a proceeding one.
         MachineInstr *Divergent = VPTState::getDivergent(Block);
-        for (auto I = ++MachineBasicBlock::iterator(Insts.front()),
-             E = ++MachineBasicBlock::iterator(Divergent); I != E; ++I)
+        MachineInstr *VPST = Insts.front();
+        auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
+        bool DivergentNextIsPredicated =
+            getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
+
+        for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
+             I != E; ++I)
           RemovePredicate(&*I);
 
         // Check if the instruction defining vpr is a vcmp so it can be combined
         // with the VPST This should be the divergent instruction
-        MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->getOpcode()) != 0
-                                 ? Divergent
-                                 : nullptr;
+        MachineInstr *VCMP =
+            VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
 
-        MachineInstrBuilder MIB;
-        if (VCMP) {
-          // Combine the VPST and VCMP into a VPT
-          MIB = BuildMI(*Divergent->getParent(), Divergent,
-                        Divergent->getDebugLoc(),
-                        TII->get(VCMPOpcodeToVPT(VCMP->getOpcode())));
+        auto ReplaceVCMPWithVPT = [&]() {
+          // Replace the VCMP with a VPT
+          MachineInstrBuilder MIB = BuildMI(
+              *Divergent->getParent(), Divergent, Divergent->getDebugLoc(),
+              TII->get(VCMPOpcodeToVPT(VCMP->getOpcode())));
           MIB.addImm(ARMVCC::Then);
           // Register one
           MIB.add(VCMP->getOperand(1));
@@ -1555,18 +1558,31 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
           MIB.add(VCMP->getOperand(3));
           LLVM_DEBUG(dbgs()
                      << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
+          LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
           LoLoop.ToRemove.insert(VCMP);
-        } else {
-          // Create a VPST (with a null mask for now, we'll recompute it later)
-          // or a VPT in case there was a VCMP right before it
-          MIB = BuildMI(*Divergent->getParent(), Divergent,
+        };
+
+        if (DivergentNextIsPredicated) {
+          // Insert a VPST at the divergent only if the next instruction
+          // would actually use it. A VCMP following a VPST can be
+          // merged into a VPT so do that instead if the VCMP exists.
+          if (!VCMP) {
+            // Create a VPST (with a null mask for now, we'll recompute it
+            // later)
+            MachineInstrBuilder MIB =
+                BuildMI(*Divergent->getParent(), Divergent,
                         Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
-          MIB.addImm(0);
-          LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+            MIB.addImm(0);
+            LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+            LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
+          } else {
+            // No RDA checks are necessary here since the VPST would have been
+            // directly before the VCMP
+            ReplaceVCMPWithVPT();
+          }
         }
-        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front());
-        LoLoop.ToRemove.insert(Insts.front());
-        LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
+        LoLoop.ToRemove.insert(VPST);
       }
     } else if (Block.containsVCTP()) {
       // The vctp will be removed, so the block mask of the vp(s)t will need


@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=force-enabled-no-reductions -o - %s | FileCheck %s
 
 define arm_aapcs_vfpcc <16 x i8> @vcmp_vpst_combination(<16 x i8>* %pSrc, i16 zeroext %blockSize, i8* nocapture %pResult, i32* nocapture %pIndex) {
@@ -40,6 +41,70 @@ do.end: ; preds = %do.body
   ret <16 x i8> %6
 }
 
+define i32 @vcmp_new_vpst_combination(i32 %len, i32* nocapture readonly %arr) {
+; CHECK-LABEL: vcmp_new_vpst_combination:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    cmp r0, #1
+; CHECK-NEXT:    blt .LBB1_4
+; CHECK-NEXT:  @ %bb.1: @ %vector.ph
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    dlstp.32 lr, r0
+; CHECK-NEXT:  .LBB1_2: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
+; CHECK-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmovt q2, q1
+; CHECK-NEXT:    vaddva.u32 r2, q2
+; CHECK-NEXT:    letp lr, .LBB1_2
+; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %len, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %len)
+  %0 = getelementptr inbounds i32, i32* %arr, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %2 = icmp ne <4 x i32> %wide.masked.load, zeroinitializer
+  %narrow = and <4 x i1> %active.lane.mask, %2
+  %3 = zext <4 x i1> %narrow to <4 x i32>
+  %4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %3)
+  %5 = add i32 %4, %vec.phi
+  %index.next = add i32 %index, 4
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
+  ret i32 %count.0.lcssa
+}
+
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+
 declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
 
 declare <16 x i1> @llvm.arm.mve.vctp8(i32)