From 11dee2eae2f7bba4b12aa1c10f59456238eb9a9d Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 12 Nov 2020 13:47:46 +0000
Subject: [PATCH] [ARM] Ensure CountReg definition dominates InsertPt when
 creating t2DoLoopStartTP

Of course there was something missing: in this case, a check that the
def of the count register we are adding to a t2DoLoopStartTP dominates
the insertion point.

In the future, when we remove some of these COPYs in between, the
t2DoLoopStartTP will always become the last instruction in the block,
preventing this from happening. In the meantime we need to check that
the count register def and the t2DoLoopStartTP are created in a
sensible order.

Differential Revision: https://reviews.llvm.org/D91287
---
 .../Target/ARM/MVEVPTOptimisationsPass.cpp    |   5 +
 .../count_dominates_start.mir                 | 214 ++++++++++++++++++
 2 files changed, 219 insertions(+)
 create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir

diff --git a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
index 86940516ca28..9cac8a8e1162 100644
--- a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -230,6 +230,11 @@ bool MVEVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
     if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
         !DT->dominates(ML->getHeader(), Use.getParent()))
       InsertPt = &Use;
+  if (InsertPt != MBB->end() &&
+      !DT->dominates(MRI->getVRegDef(CountReg), &*InsertPt)) {
+    LLVM_DEBUG(dbgs() << "  CountReg def does not dominate InsertPt!\n");
+    return false; // Bail out: CountReg is not available at InsertPt.
+  }
 
   MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(),
                                    TII->get(ARM::t2DoLoopStartTP))
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir
new file mode 100644
index 000000000000..627bf5d2e199
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir
@@ -0,0 +1,214 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+lob -run-pass=arm-mve-vpt-opts %s -verify-machineinstrs -o - | FileCheck %s
+
+--- |
+
+  define i32 @test(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) {
+  entry:
+    %cmp10 = icmp sgt i32 %n, 0
+    %0 = add i32 %n, 7
+    %1 = lshr i32 %0, 3
+    %2 = shl nuw i32 %1, 3
+    %3 = add i32 %2, -8
+    %4 = lshr i32 %3, 3
+    %5 = add nuw nsw i32 %4, 1
+    br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
+
+  vector.ph:                                        ; preds = %entry
+    %6 = call i32 @llvm.start.loop.iterations.i32(i32 %5)
+    br label %vector.body
+
+  vector.body:                                      ; preds = %vector.body, %vector.ph
+    %lsr.iv3 = phi i16* [ %scevgep4, %vector.body ], [ %x, %vector.ph ]
+    %lsr.iv1 = phi i16* [ %scevgep, %vector.body ], [ %y, %vector.ph ]
+    %vec.phi = phi i32 [ 0, %vector.ph ], [ %16, %vector.body ]
+    %7 = phi i32 [ %6, %vector.ph ], [ %17, %vector.body ]
+    %8 = phi i32 [ %n, %vector.ph ], [ %10, %vector.body ]
+    %lsr.iv12 = bitcast i16* %lsr.iv1 to <8 x i16>*
+    %lsr.iv35 = bitcast i16* %lsr.iv3 to <8 x i16>*
+    %9 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %8)
+    %10 = sub i32 %8, 8
+    %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv35, i32 2, <8 x i1> %9, <8 x i16> undef)
+    %11 = sext <8 x i16> %wide.masked.load to <8 x i32>
+    %wide.masked.load13 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv12, i32 2, <8 x i1> %9, <8 x i16> undef)
+    %12 = sext <8 x i16> %wide.masked.load13 to <8 x i32>
+    %13 = mul nsw <8 x i32> %12, %11
+    %14 = select <8 x i1> %9, <8 x i32> %13, <8 x i32> zeroinitializer
+    %15 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %14)
+    %16 = add i32 %15, %vec.phi
+    %scevgep = getelementptr i16, i16* %lsr.iv1, i32 8
+    %scevgep4 = getelementptr i16, i16* %lsr.iv3, i32 8
+    %17 = call i32 @llvm.loop.decrement.reg.i32(i32 %7, i32 1)
+    %18 = icmp ne i32 %17, 0
+    br i1 %18, label %vector.body, label %for.cond.cleanup
+
+  for.cond.cleanup:                                 ; preds = %vector.body, %entry
+    %s.0.lcssa = phi i32 [ 0, %entry ], [ %16, %vector.body ]
+    ret i32 %s.0.lcssa
+  }
+
+  declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
+  declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
+  declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+  declare i32 @llvm.start.loop.iterations.i32(i32)
+  declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
+  declare <8 x i1> @llvm.arm.mve.vctp16(i32)
+
+...
+---
+name:            test
+alignment:       2
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: rgpr, preferred-register: '' }
+  - { id: 1, class: gpr, preferred-register: '' }
+  - { id: 2, class: gprnopc, preferred-register: '' }
+  - { id: 3, class: gprnopc, preferred-register: '' }
+  - { id: 4, class: tgpreven, preferred-register: '' }
+  - { id: 5, class: gprlr, preferred-register: '' }
+  - { id: 6, class: rgpr, preferred-register: '' }
+  - { id: 7, class: gpr, preferred-register: '' }
+  - { id: 8, class: gpr, preferred-register: '' }
+  - { id: 9, class: gpr, preferred-register: '' }
+  - { id: 10, class: gpr, preferred-register: '' }
+  - { id: 11, class: gpr, preferred-register: '' }
+  - { id: 12, class: gpr, preferred-register: '' }
+  - { id: 13, class: gpr, preferred-register: '' }
+  - { id: 14, class: gpr, preferred-register: '' }
+  - { id: 15, class: gprnopc, preferred-register: '' }
+  - { id: 16, class: gpr, preferred-register: '' }
+  - { id: 17, class: rgpr, preferred-register: '' }
+  - { id: 18, class: rgpr, preferred-register: '' }
+  - { id: 19, class: rgpr, preferred-register: '' }
+  - { id: 20, class: rgpr, preferred-register: '' }
+  - { id: 21, class: gprnopc, preferred-register: '' }
+  - { id: 22, class: rgpr, preferred-register: '' }
+  - { id: 23, class: gpr, preferred-register: '' }
+  - { id: 24, class: gprlr, preferred-register: '' }
+  - { id: 25, class: rgpr, preferred-register: '' }
+  - { id: 26, class: vccr, preferred-register: '' }
+  - { id: 27, class: rgpr, preferred-register: '' }
+  - { id: 28, class: rgpr, preferred-register: '' }
+  - { id: 29, class: mqpr, preferred-register: '' }
+  - { id: 30, class: rgpr, preferred-register: '' }
+  - { id: 31, class: mqpr, preferred-register: '' }
+  - { id: 32, class: tgpreven, preferred-register: '' }
+  - { id: 33, class: gprlr, preferred-register: '' }
+  - { id: 34, class: gprlr, preferred-register: '' }
+  - { id: 35, class: gprnopc, preferred-register: '' }
+liveins:
+  - { reg: '$r0', virtual-reg: '%13' }
+  - { reg: '$r1', virtual-reg: '%14' }
+  - { reg: '$r2', virtual-reg: '%15' }
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.2(0x50000000), %bb.1(0x30000000)
+  ; CHECK:   liveins: $r0, $r1, $r2
+  ; CHECK:   [[COPY:%[0-9]+]]:gprnopc = COPY $r2
+  ; CHECK:   [[COPY1:%[0-9]+]]:gpr = COPY $r1
+  ; CHECK:   [[COPY2:%[0-9]+]]:gpr = COPY $r0
+  ; CHECK:   t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2Bcc %bb.2, 10 /* CC::ge */, $cpsr
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.4(0x80000000)
+  ; CHECK:   [[t2MOVi:%[0-9]+]]:rgpr = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[COPY3:%[0-9]+]]:gpr = COPY [[t2MOVi]]
+  ; CHECK:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK: bb.2.vector.ph:
+  ; CHECK:   successors: %bb.3(0x80000000)
+  ; CHECK:   [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 7, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[t2BICri:%[0-9]+]]:rgpr = t2BICri [[t2ADDri]], 7, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[t2BICri]], 8, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[t2MOVi1:%[0-9]+]]:rgpr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[t2ADDrs:%[0-9]+]]:gprnopc = nuw nsw t2ADDrs [[t2MOVi1]], [[t2SUBri]], 27, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[COPY4:%[0-9]+]]:rgpr = COPY [[t2ADDrs]]
+  ; CHECK:   [[t2DoLoopStart:%[0-9]+]]:gprlr = t2DoLoopStart [[COPY4]]
+  ; CHECK:   [[t2MOVi2:%[0-9]+]]:rgpr = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[COPY5:%[0-9]+]]:gpr = COPY [[t2MOVi2]]
+  ; CHECK:   [[COPY6:%[0-9]+]]:gpr = COPY [[t2DoLoopStart]]
+  ; CHECK:   [[COPY7:%[0-9]+]]:gprnopc = COPY [[COPY]]
+  ; CHECK: bb.3.vector.body:
+  ; CHECK:   successors: %bb.3(0x7c000000), %bb.4(0x04000000)
+  ; CHECK:   [[PHI:%[0-9]+]]:gprnopc = PHI [[COPY2]], %bb.2, %10, %bb.3
+  ; CHECK:   [[PHI1:%[0-9]+]]:gprnopc = PHI [[COPY1]], %bb.2, %9, %bb.3
+  ; CHECK:   [[PHI2:%[0-9]+]]:tgpreven = PHI [[COPY5]], %bb.2, %8, %bb.3
+  ; CHECK:   [[PHI3:%[0-9]+]]:gprlr = PHI [[COPY6]], %bb.2, %11, %bb.3
+  ; CHECK:   [[PHI4:%[0-9]+]]:rgpr = PHI [[COPY7]], %bb.2, %7, %bb.3
+  ; CHECK:   [[MVE_VCTP16_:%[0-9]+]]:vccr = MVE_VCTP16 [[PHI4]], 0, $noreg
+  ; CHECK:   [[t2SUBri1:%[0-9]+]]:rgpr = t2SUBri [[PHI4]], 8, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[COPY8:%[0-9]+]]:gpr = COPY [[t2SUBri1]]
+  ; CHECK:   [[MVE_VLDRHU16_post:%[0-9]+]]:rgpr, [[MVE_VLDRHU16_post1:%[0-9]+]]:mqpr = MVE_VLDRHU16_post [[PHI]], 16, 1, [[MVE_VCTP16_]] :: (load 16 from %ir.lsr.iv35, align 2)
+  ; CHECK:   [[MVE_VLDRHU16_post2:%[0-9]+]]:rgpr, [[MVE_VLDRHU16_post3:%[0-9]+]]:mqpr = MVE_VLDRHU16_post [[PHI1]], 16, 1, [[MVE_VCTP16_]] :: (load 16 from %ir.lsr.iv12, align 2)
+  ; CHECK:   [[MVE_VMLADAVas16_:%[0-9]+]]:tgpreven = MVE_VMLADAVas16 [[PHI2]], killed [[MVE_VLDRHU16_post3]], killed [[MVE_VLDRHU16_post1]], 1, [[MVE_VCTP16_]]
+  ; CHECK:   [[COPY9:%[0-9]+]]:gpr = COPY [[MVE_VMLADAVas16_]]
+  ; CHECK:   [[COPY10:%[0-9]+]]:gpr = COPY [[MVE_VLDRHU16_post2]]
+  ; CHECK:   [[COPY11:%[0-9]+]]:gpr = COPY [[MVE_VLDRHU16_post]]
+  ; CHECK:   [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI3]], 1
+  ; CHECK:   [[COPY12:%[0-9]+]]:gpr = COPY [[t2LoopDec]]
+  ; CHECK:   t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def dead $cpsr
+  ; CHECK:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK: bb.4.for.cond.cleanup:
+  ; CHECK:   [[PHI5:%[0-9]+]]:gpr = PHI [[COPY3]], %bb.1, [[COPY9]], %bb.3
+  ; CHECK:   $r0 = COPY [[PHI5]]
+  ; CHECK:   tBX_RET 14 /* CC::al */, $noreg, implicit $r0
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.4(0x30000000)
+    liveins: $r0, $r1, $r2
+
+    %15:gprnopc = COPY $r2
+    %14:gpr = COPY $r1
+    %13:gpr = COPY $r0
+    t2CMPri %15, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.1, 10 /* CC::ge */, $cpsr
+
+  bb.4:
+    successors: %bb.3(0x80000000)
+
+    %22:rgpr = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    %16:gpr = COPY %22
+    t2B %bb.3, 14 /* CC::al */, $noreg
+
+  bb.1.vector.ph:
+    successors: %bb.2(0x80000000)
+
+    %17:rgpr = t2ADDri %15, 7, 14 /* CC::al */, $noreg, $noreg
+    %18:rgpr = t2BICri %17, 7, 14 /* CC::al */, $noreg, $noreg
+    %19:rgpr = t2SUBri %18, 8, 14 /* CC::al */, $noreg, $noreg
+    %20:rgpr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+    %21:gprnopc = nuw nsw t2ADDrs %20, %19, 27, 14 /* CC::al */, $noreg, $noreg
+    %0:rgpr = COPY %21
+    %24:gprlr = t2DoLoopStart %0
+    %25:rgpr = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    %23:gpr = COPY %25
+    %1:gpr = COPY %24
+    %35:gprnopc = COPY %15
+
+  bb.2.vector.body:
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+
+    %2:gprnopc = PHI %13, %bb.1, %10, %bb.2
+    %3:gprnopc = PHI %14, %bb.1, %9, %bb.2
+    %4:tgpreven = PHI %23, %bb.1, %8, %bb.2
+    %5:gprlr = PHI %1, %bb.1, %11, %bb.2
+    %6:rgpr = PHI %35, %bb.1, %7, %bb.2
+    %26:vccr = MVE_VCTP16 %6, 0, $noreg
+    %27:rgpr = t2SUBri %6, 8, 14 /* CC::al */, $noreg, $noreg
+    %7:gpr = COPY %27
+    %28:rgpr, %29:mqpr = MVE_VLDRHU16_post %2, 16, 1, %26 :: (load 16 from %ir.lsr.iv35, align 2)
+    %30:rgpr, %31:mqpr = MVE_VLDRHU16_post %3, 16, 1, %26 :: (load 16 from %ir.lsr.iv12, align 2)
+    %32:tgpreven = MVE_VMLADAVas16 %4, killed %31, killed %29, 1, %26
+    %8:gpr = COPY %32
+    %9:gpr = COPY %30
+    %10:gpr = COPY %28
+    %33:gprlr = t2LoopDec %5, 1
+    %11:gpr = COPY %33
+    t2LoopEnd %33, %bb.2, implicit-def dead $cpsr
+    t2B %bb.3, 14 /* CC::al */, $noreg
+
+  bb.3.for.cond.cleanup:
+    %12:gpr = PHI %16, %bb.4, %8, %bb.2
+    $r0 = COPY %12
+    tBX_RET 14 /* CC::al */, $noreg, implicit $r0
+
+...