[ARM] Ensure CountReg definition dominates InsertPt when creating t2DoLoopStartTP

This adds a previously missing check: the definition of the count
register that we are feeding into a t2DoLoopStartTP must dominate the
chosen insertion point, otherwise the new instruction would use the
value before it is defined.

In the future, once some of the intervening COPYs are removed, the
t2DoLoopStartTP will always end up as the last instruction in the
block, making this situation impossible. In the meantime we need to
check that the instructions are created in a sensible order.

Differential Revision: https://reviews.llvm.org/D91287
This commit is contained in:
David Green 2020-11-12 13:47:46 +00:00
parent ec63dfe368
commit 11dee2eae2
2 changed files with 219 additions and 0 deletions

View File

@ -230,6 +230,11 @@ bool MVEVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
!DT->dominates(ML->getHeader(), Use.getParent()))
InsertPt = &Use;
if (InsertPt != MBB->end() &&
!DT->dominates(MRI->getVRegDef(CountReg), &*InsertPt)) {
LLVM_DEBUG(dbgs() << " InsertPt does not dominate CountReg!\n");
return false;
}
MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(),
TII->get(ARM::t2DoLoopStartTP))

View File

@ -0,0 +1,214 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+lob -run-pass=arm-mve-vpt-opts %s -verify-machineinstrs -o - | FileCheck %s
--- |
define i32 @test(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) {
entry:
%cmp10 = icmp sgt i32 %n, 0
%0 = add i32 %n, 7
%1 = lshr i32 %0, 3
%2 = shl nuw i32 %1, 3
%3 = add i32 %2, -8
%4 = lshr i32 %3, 3
%5 = add nuw nsw i32 %4, 1
br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%6 = call i32 @llvm.start.loop.iterations.i32(i32 %5)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv3 = phi i16* [ %scevgep4, %vector.body ], [ %x, %vector.ph ]
%lsr.iv1 = phi i16* [ %scevgep, %vector.body ], [ %y, %vector.ph ]
%vec.phi = phi i32 [ 0, %vector.ph ], [ %16, %vector.body ]
%7 = phi i32 [ %6, %vector.ph ], [ %17, %vector.body ]
%8 = phi i32 [ %n, %vector.ph ], [ %10, %vector.body ]
%lsr.iv12 = bitcast i16* %lsr.iv1 to <8 x i16>*
%lsr.iv35 = bitcast i16* %lsr.iv3 to <8 x i16>*
%9 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %8)
%10 = sub i32 %8, 8
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv35, i32 2, <8 x i1> %9, <8 x i16> undef)
%11 = sext <8 x i16> %wide.masked.load to <8 x i32>
%wide.masked.load13 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv12, i32 2, <8 x i1> %9, <8 x i16> undef)
%12 = sext <8 x i16> %wide.masked.load13 to <8 x i32>
%13 = mul nsw <8 x i32> %12, %11
%14 = select <8 x i1> %9, <8 x i32> %13, <8 x i32> zeroinitializer
%15 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %14)
%16 = add i32 %15, %vec.phi
%scevgep = getelementptr i16, i16* %lsr.iv1, i32 8
%scevgep4 = getelementptr i16, i16* %lsr.iv3, i32 8
%17 = call i32 @llvm.loop.decrement.reg.i32(i32 %7, i32 1)
%18 = icmp ne i32 %17, 0
br i1 %18, label %vector.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %vector.body, %entry
%s.0.lcssa = phi i32 [ 0, %entry ], [ %16, %vector.body ]
ret i32 %s.0.lcssa
}
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
...
---
name: test
alignment: 2
tracksRegLiveness: true
registers:
- { id: 0, class: rgpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gprnopc, preferred-register: '' }
- { id: 3, class: gprnopc, preferred-register: '' }
- { id: 4, class: tgpreven, preferred-register: '' }
- { id: 5, class: gprlr, preferred-register: '' }
- { id: 6, class: rgpr, preferred-register: '' }
- { id: 7, class: gpr, preferred-register: '' }
- { id: 8, class: gpr, preferred-register: '' }
- { id: 9, class: gpr, preferred-register: '' }
- { id: 10, class: gpr, preferred-register: '' }
- { id: 11, class: gpr, preferred-register: '' }
- { id: 12, class: gpr, preferred-register: '' }
- { id: 13, class: gpr, preferred-register: '' }
- { id: 14, class: gpr, preferred-register: '' }
- { id: 15, class: gprnopc, preferred-register: '' }
- { id: 16, class: gpr, preferred-register: '' }
- { id: 17, class: rgpr, preferred-register: '' }
- { id: 18, class: rgpr, preferred-register: '' }
- { id: 19, class: rgpr, preferred-register: '' }
- { id: 20, class: rgpr, preferred-register: '' }
- { id: 21, class: gprnopc, preferred-register: '' }
- { id: 22, class: rgpr, preferred-register: '' }
- { id: 23, class: gpr, preferred-register: '' }
- { id: 24, class: gprlr, preferred-register: '' }
- { id: 25, class: rgpr, preferred-register: '' }
- { id: 26, class: vccr, preferred-register: '' }
- { id: 27, class: rgpr, preferred-register: '' }
- { id: 28, class: rgpr, preferred-register: '' }
- { id: 29, class: mqpr, preferred-register: '' }
- { id: 30, class: rgpr, preferred-register: '' }
- { id: 31, class: mqpr, preferred-register: '' }
- { id: 32, class: tgpreven, preferred-register: '' }
- { id: 33, class: gprlr, preferred-register: '' }
- { id: 34, class: gprlr, preferred-register: '' }
- { id: 35, class: gprnopc, preferred-register: '' }
liveins:
- { reg: '$r0', virtual-reg: '%13' }
- { reg: '$r1', virtual-reg: '%14' }
- { reg: '$r2', virtual-reg: '%15' }
body: |
; CHECK-LABEL: name: test
; CHECK: bb.0.entry:
; CHECK: successors: %bb.2(0x50000000), %bb.1(0x30000000)
; CHECK: liveins: $r0, $r1, $r2
; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r2
; CHECK: [[COPY1:%[0-9]+]]:gpr = COPY $r1
; CHECK: [[COPY2:%[0-9]+]]:gpr = COPY $r0
; CHECK: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: t2Bcc %bb.2, 10 /* CC::ge */, $cpsr
; CHECK: bb.1:
; CHECK: successors: %bb.4(0x80000000)
; CHECK: [[t2MOVi:%[0-9]+]]:rgpr = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[COPY3:%[0-9]+]]:gpr = COPY [[t2MOVi]]
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
; CHECK: bb.2.vector.ph:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 7, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri [[t2ADDri]], 7, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[t2BICri]], 8, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2MOVi1:%[0-9]+]]:rgpr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2ADDrs:%[0-9]+]]:gprnopc = nuw nsw t2ADDrs [[t2MOVi1]], [[t2SUBri]], 27, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[COPY4:%[0-9]+]]:rgpr = COPY [[t2ADDrs]]
; CHECK: [[t2DoLoopStart:%[0-9]+]]:gprlr = t2DoLoopStart [[COPY4]]
; CHECK: [[t2MOVi2:%[0-9]+]]:rgpr = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[COPY5:%[0-9]+]]:gpr = COPY [[t2MOVi2]]
; CHECK: [[COPY6:%[0-9]+]]:gpr = COPY [[t2DoLoopStart]]
; CHECK: [[COPY7:%[0-9]+]]:gprnopc = COPY [[COPY]]
; CHECK: bb.3.vector.body:
; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000)
; CHECK: [[PHI:%[0-9]+]]:gprnopc = PHI [[COPY2]], %bb.2, %10, %bb.3
; CHECK: [[PHI1:%[0-9]+]]:gprnopc = PHI [[COPY1]], %bb.2, %9, %bb.3
; CHECK: [[PHI2:%[0-9]+]]:tgpreven = PHI [[COPY5]], %bb.2, %8, %bb.3
; CHECK: [[PHI3:%[0-9]+]]:gprlr = PHI [[COPY6]], %bb.2, %11, %bb.3
; CHECK: [[PHI4:%[0-9]+]]:rgpr = PHI [[COPY7]], %bb.2, %7, %bb.3
; CHECK: [[MVE_VCTP16_:%[0-9]+]]:vccr = MVE_VCTP16 [[PHI4]], 0, $noreg
; CHECK: [[t2SUBri1:%[0-9]+]]:rgpr = t2SUBri [[PHI4]], 8, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[COPY8:%[0-9]+]]:gpr = COPY [[t2SUBri1]]
; CHECK: [[MVE_VLDRHU16_post:%[0-9]+]]:rgpr, [[MVE_VLDRHU16_post1:%[0-9]+]]:mqpr = MVE_VLDRHU16_post [[PHI]], 16, 1, [[MVE_VCTP16_]] :: (load 16 from %ir.lsr.iv35, align 2)
; CHECK: [[MVE_VLDRHU16_post2:%[0-9]+]]:rgpr, [[MVE_VLDRHU16_post3:%[0-9]+]]:mqpr = MVE_VLDRHU16_post [[PHI1]], 16, 1, [[MVE_VCTP16_]] :: (load 16 from %ir.lsr.iv12, align 2)
; CHECK: [[MVE_VMLADAVas16_:%[0-9]+]]:tgpreven = MVE_VMLADAVas16 [[PHI2]], killed [[MVE_VLDRHU16_post3]], killed [[MVE_VLDRHU16_post1]], 1, [[MVE_VCTP16_]]
; CHECK: [[COPY9:%[0-9]+]]:gpr = COPY [[MVE_VMLADAVas16_]]
; CHECK: [[COPY10:%[0-9]+]]:gpr = COPY [[MVE_VLDRHU16_post2]]
; CHECK: [[COPY11:%[0-9]+]]:gpr = COPY [[MVE_VLDRHU16_post]]
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI3]], 1
; CHECK: [[COPY12:%[0-9]+]]:gpr = COPY [[t2LoopDec]]
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def dead $cpsr
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
; CHECK: bb.4.for.cond.cleanup:
; CHECK: [[PHI5:%[0-9]+]]:gpr = PHI [[COPY3]], %bb.1, [[COPY9]], %bb.3
; CHECK: $r0 = COPY [[PHI5]]
; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0
bb.0.entry:
successors: %bb.1(0x50000000), %bb.4(0x30000000)
liveins: $r0, $r1, $r2
%15:gprnopc = COPY $r2
%14:gpr = COPY $r1
%13:gpr = COPY $r0
t2CMPri %15, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
t2Bcc %bb.1, 10 /* CC::ge */, $cpsr
bb.4:
successors: %bb.3(0x80000000)
%22:rgpr = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
%16:gpr = COPY %22
t2B %bb.3, 14 /* CC::al */, $noreg
bb.1.vector.ph:
successors: %bb.2(0x80000000)
%17:rgpr = t2ADDri %15, 7, 14 /* CC::al */, $noreg, $noreg
%18:rgpr = t2BICri %17, 7, 14 /* CC::al */, $noreg, $noreg
%19:rgpr = t2SUBri %18, 8, 14 /* CC::al */, $noreg, $noreg
%20:rgpr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
%21:gprnopc = nuw nsw t2ADDrs %20, %19, 27, 14 /* CC::al */, $noreg, $noreg
%0:rgpr = COPY %21
%24:gprlr = t2DoLoopStart %0
%25:rgpr = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
%23:gpr = COPY %25
%1:gpr = COPY %24
%35:gprnopc = COPY %15
bb.2.vector.body:
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
%2:gprnopc = PHI %13, %bb.1, %10, %bb.2
%3:gprnopc = PHI %14, %bb.1, %9, %bb.2
%4:tgpreven = PHI %23, %bb.1, %8, %bb.2
%5:gprlr = PHI %1, %bb.1, %11, %bb.2
%6:rgpr = PHI %35, %bb.1, %7, %bb.2
%26:vccr = MVE_VCTP16 %6, 0, $noreg
%27:rgpr = t2SUBri %6, 8, 14 /* CC::al */, $noreg, $noreg
%7:gpr = COPY %27
%28:rgpr, %29:mqpr = MVE_VLDRHU16_post %2, 16, 1, %26 :: (load 16 from %ir.lsr.iv35, align 2)
%30:rgpr, %31:mqpr = MVE_VLDRHU16_post %3, 16, 1, %26 :: (load 16 from %ir.lsr.iv12, align 2)
%32:tgpreven = MVE_VMLADAVas16 %4, killed %31, killed %29, 1, %26
%8:gpr = COPY %32
%9:gpr = COPY %30
%10:gpr = COPY %28
%33:gprlr = t2LoopDec %5, 1
%11:gpr = COPY %33
t2LoopEnd %33, %bb.2, implicit-def dead $cpsr
t2B %bb.3, 14 /* CC::al */, $noreg
bb.3.for.cond.cleanup:
%12:gpr = PHI %16, %bb.4, %8, %bb.2
$r0 = COPY %12
tBX_RET 14 /* CC::al */, $noreg, implicit $r0
...