[ARM][LowOverheadLoops] Merge a VCMP and the new VPST into a VPT

There were cases where a VCMP and a VPST were merged even though the
VCMP's operands did not have the same reaching definitions at the VPST
as at the VCMP, i.e. an operand was redefined between the two
instructions. This is fixed by adding RDA checks on those defs. That in
turn gave rise to cases where the newly created VPST would precede the
un-merged VCMP and so would trip a predicate mask assertion, since the
VCMP wasn't predicated. This is solved by converting the VCMP into a
VPT instead of inserting a new VPST.

Differential Revision: https://reviews.llvm.org/D90461
Author: Sam Tebbs
Date:   2020-10-30 13:30:58 +00:00
parent 6e8a8c2d7e
commit 40a3f7e48d
2 changed files with 101 additions and 20 deletions
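
For context, the reaching-definition requirement described in the commit
message amounts to: a VCMP may only be folded into a VPST (or turned into a
VPT at the VPST's position) if none of the VCMP's register operands is
redefined between the two instructions. Below is a minimal, hypothetical
sketch of such a check using ReachingDefAnalysis; the helper name and its
placement are illustrative only and are not part of this commit's code.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ReachingDefAnalysis.h"

using namespace llvm;

// Hypothetical helper (not the commit's actual code): the VCMP can only be
// combined with the VPST if every register operand of the VCMP sees the same
// reaching definition at both instructions, i.e. nothing clobbers an operand
// between the VCMP and the VPST.
static bool canCombineVCMPWithVPST(ReachingDefAnalysis &RDA,
                                   MachineInstr &VCMP, MachineInstr &VPST) {
  for (const MachineOperand &MO : VCMP.operands()) {
    if (!MO.isReg() || !MO.getReg() || !MO.isUse())
      continue;
    // Does this register have the same reaching def at the VCMP and the VPST?
    if (!RDA.hasSameReachingDef(&VCMP, &VPST, MO.getReg().asMCReg()))
      return false;
  }
  return true;
}

In the new vcmp_new_vpst_combination test below, the vmov of q2 sitting
between the vcmp and the vpst is this situation: q2 is an operand of the vcmp
and is redefined before the vpst, which is why the expected output keeps a
separate vcmp/vpst pair instead of a single vpt.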


@@ -1530,22 +1530,25 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
         // TODO: We could be producing more VPT blocks than necessary and could
         // fold the newly created one into a proceeding one.
         MachineInstr *Divergent = VPTState::getDivergent(Block);
-        for (auto I = ++MachineBasicBlock::iterator(Insts.front()),
-             E = ++MachineBasicBlock::iterator(Divergent); I != E; ++I)
+        MachineInstr *VPST = Insts.front();
+        auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
+        bool DivergentNextIsPredicated =
+            getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
+
+        for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
+             I != E; ++I)
           RemovePredicate(&*I);
 
         // Check if the instruction defining vpr is a vcmp so it can be combined
         // with the VPST This should be the divergent instruction
-        MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->getOpcode()) != 0
-                                 ? Divergent
-                                 : nullptr;
+        MachineInstr *VCMP =
+            VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
 
-        MachineInstrBuilder MIB;
-        if (VCMP) {
-          // Combine the VPST and VCMP into a VPT
-          MIB = BuildMI(*Divergent->getParent(), Divergent,
-                        Divergent->getDebugLoc(),
-                        TII->get(VCMPOpcodeToVPT(VCMP->getOpcode())));
+        auto ReplaceVCMPWithVPT = [&]() {
+          // Replace the VCMP with a VPT
+          MachineInstrBuilder MIB = BuildMI(
+              *Divergent->getParent(), Divergent, Divergent->getDebugLoc(),
+              TII->get(VCMPOpcodeToVPT(VCMP->getOpcode())));
           MIB.addImm(ARMVCC::Then);
           // Register one
           MIB.add(VCMP->getOperand(1));
@@ -1555,18 +1558,31 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
           MIB.add(VCMP->getOperand(3));
           LLVM_DEBUG(dbgs()
                      << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
+          LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
           LoLoop.ToRemove.insert(VCMP);
-        } else {
-          // Create a VPST (with a null mask for now, we'll recompute it later)
-          // or a VPT in case there was a VCMP right before it
-          MIB = BuildMI(*Divergent->getParent(), Divergent,
+        };
+
+        if (DivergentNextIsPredicated) {
+          // Insert a VPST at the divergent only if the next instruction
+          // would actually use it. A VCMP following a VPST can be
+          // merged into a VPT so do that instead if the VCMP exists.
+          if (!VCMP) {
+            // Create a VPST (with a null mask for now, we'll recompute it
+            // later)
+            MachineInstrBuilder MIB =
+                BuildMI(*Divergent->getParent(), Divergent,
                         Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
-          MIB.addImm(0);
-          LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+            MIB.addImm(0);
+            LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+            LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
+          } else {
+            // No RDA checks are necessary here since the VPST would have been
+            // directly before the VCMP
+            ReplaceVCMPWithVPT();
+          }
         }
-        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front());
-        LoLoop.ToRemove.insert(Insts.front());
-        LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
+        LoLoop.ToRemove.insert(VPST);
       }
     } else if (Block.containsVCTP()) {
       // The vctp will be removed, so the block mask of the vp(s)t will need


@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=force-enabled-no-reductions -o - %s | FileCheck %s
 
 define arm_aapcs_vfpcc <16 x i8> @vcmp_vpst_combination(<16 x i8>* %pSrc, i16 zeroext %blockSize, i8* nocapture %pResult, i32* nocapture %pIndex) {
@@ -40,6 +41,70 @@ do.end: ; preds = %do.body
   ret <16 x i8> %6
 }
 
+define i32 @vcmp_new_vpst_combination(i32 %len, i32* nocapture readonly %arr) {
+; CHECK-LABEL: vcmp_new_vpst_combination:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    cmp r0, #1
+; CHECK-NEXT:    blt .LBB1_4
+; CHECK-NEXT:  @ %bb.1: @ %vector.ph
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    dlstp.32 lr, r0
+; CHECK-NEXT:  .LBB1_2: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
+; CHECK-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmovt q2, q1
+; CHECK-NEXT:    vaddva.u32 r2, q2
+; CHECK-NEXT:    letp lr, .LBB1_2
+; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %len, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %len)
+  %0 = getelementptr inbounds i32, i32* %arr, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %2 = icmp ne <4 x i32> %wide.masked.load, zeroinitializer
+  %narrow = and <4 x i1> %active.lane.mask, %2
+  %3 = zext <4 x i1> %narrow to <4 x i32>
+  %4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %3)
+  %5 = add i32 %4, %vec.phi
+  %index.next = add i32 %index, 4
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
+  ret i32 %count.0.lcssa
+}
+
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+
 declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
 
 declare <16 x i1> @llvm.arm.mve.vctp8(i32)