# llvm-project/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir

# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s

# There are 2 SUBS, and the 2nd one is identified as the def.
# Thus, the 1st is a use, and we shouldn't optimise away the SUBS.

# CHECK:  bb.1.vector.body:
# CHECK:    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
# CHECK:    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
# CHECK:    $lr = MVE_LETP renamable $lr, %bb.1

--- |
  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
  target triple = "thumbv8.1m.main-arm-unknown-eabi"

  ; IR counterpart of the MIR function of the same name in this file; the MIR
  ; memory operands refer back to %lsr.iv13, %lsr.iv1416 and %lsr.iv1719 here.
  ; Each iteration stores (load via %C) + (load via %B) through %A, four i32
  ; lanes at a time, under the predicate produced by @llvm.arm.vctp32.
  define dso_local void @use_before_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
  entry:
    ; %5 = (((((N + 3) >> 2) << 2) - 4) >> 2) + 1 is the iteration count that
    ; vector.ph passes to @llvm.set.loop.iterations. The loop is entered only
    ; when N > 0 (%cmp8).
    %cmp8 = icmp sgt i32 %N, 0
    %0 = add i32 %N, 3
    %1 = lshr i32 %0, 2
    %2 = shl nuw i32 %1, 2
    %3 = add i32 %2, -4
    %4 = lshr i32 %3, 2
    %5 = add nuw nsw i32 %4, 1
    br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

  vector.ph:                                        ; preds = %entry
    call void @llvm.set.loop.iterations.i32(i32 %5)
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    ; %6 is the remaining-iteration counter (starts at %5); %7 is the element
    ; count (starts at %N) that feeds the vctp32 predicate.
    %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
    %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
    %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
    %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
    %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
    %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
    %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
    %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
    ; %8 predicates both masked loads and the masked store; %9 is the element
    ; count reduced by 4 for the next iteration. Note this IR has a single sub,
    ; whereas the MIR body in this file carries two tSUBi8 on $r3.
    %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7)
    %9 = sub i32 %7, 4
    %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3
    %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3
    %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3
    ; Advance all three pointers by 4 i32 elements.
    %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
    %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
    %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
    ; Decrement the iteration counter by 1 and keep looping while it is non-zero.
    %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
    %12 = icmp ne i32 %11, 0
    br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7

  for.cond.cleanup:                                 ; preds = %vector.body, %entry
    ret void
  }
  ; Intrinsic declarations. @use_before_def calls set.loop.iterations, vctp32,
  ; loop.decrement.reg, masked.load and masked.store; lifetime.start and
  ; stackprotector are declared but not called by the function in this module.
  declare void @llvm.set.loop.iterations.i32(i32) #1
  declare <4 x i1> @llvm.arm.vctp32(i32) #2
  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
  declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3
  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4
  declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3
  declare void @llvm.stackprotector(i8*, i8**) #5

  attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" }
  attributes #1 = { noduplicate nounwind }
  attributes #2 = { nounwind readnone }
  attributes #3 = { argmemonly nounwind willreturn }
  attributes #4 = { argmemonly nounwind readonly willreturn }
  attributes #5 = { nounwind }

  !llvm.module.flags = !{!0, !1}
  !llvm.ident = !{!2}

  !0 = !{i32 1, !"wchar_size", i32 4}
  !1 = !{i32 1, !"min_enum_size", i32 4}
  !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"}
  !3 = !{!4, !4, i64 0}
  !4 = !{!"int", !5, i64 0}
  !5 = !{!"omnipotent char", !6, i64 0}
  !6 = !{!"Simple C++ TBAA"}
  !7 = distinct !{!7, !8}
  !8 = !{!"llvm.loop.isvectorized", i32 1}

...
---
# Machine-function metadata for use_before_def (same name as the IR function
# in the embedded module). Register liveness is tracked, so the liveins lists
# and kill/dead flags in the body are expected to be consistent.
name:            use_before_def
alignment:       2
exposesReturnsTwice: false
legalized:       false
regBankSelected: false
selected:        false
failedISel:      false
tracksRegLiveness: true
hasWinCFI:       false
registers:       []
# Function live-ins: $r0-$r3 (presumably the four IR arguments A, B, C, N).
liveins:
  - { reg: '$r0', virtual-reg: '' }
  - { reg: '$r1', virtual-reg: '' }
  - { reg: '$r2', virtual-reg: '' }
  - { reg: '$r3', virtual-reg: '' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap:     false
  hasPatchPoint:   false
  stackSize:       8
  offsetAdjustment: 0
  maxAlignment:    4
  adjustsStack:    false
  hasCalls:        false
  stackProtector:  ''
  maxCallFrameSize: 0
  cvBytesOfCalleeSavedRegisters: 0
  hasOpaqueSPAdjustment: false
  hasVAStart:      false
  hasMustTailInVarArgFunc: false
  localFrameSize:  0
  savePoint:       ''
  restorePoint:    ''
fixedStack:      []
# Two 4-byte spill slots for the callee-saved $lr and $r7, which together
# account for the stackSize of 8 recorded in frameInfo.
stack:
  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites:       []
constants:       []
machineFunctionInfo: {}
body:             |
  ; Input to the arm-low-overhead-loops pass (see the RUN line). The loop block
  ; bb.1 deliberately contains two tSUBi8 updates of $r3; the CHECK lines at the
  ; top of the file expect both to survive and the loop to end in MVE_LETP.
  bb.0.entry:
    successors: %bb.1(0x80000000)
    liveins: $r0, $r1, $r2, $r3, $lr

    ; Prologue: push $r7/$lr and set up $r7 as the frame register.
    frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
    frame-setup CFI_INSTRUCTION def_cfa_offset 8
    frame-setup CFI_INSTRUCTION offset $lr, -4
    frame-setup CFI_INSTRUCTION offset $r7, -8
    $r7 = frame-setup tMOVr $sp, 14, $noreg
    frame-setup CFI_INSTRUCTION def_cfa_register $r7
    ; Compare $r3 against 1 and conditionally return inside an IT block.
    ; NOTE(review): predicate 11 is presumably LT, i.e. the early exit for the
    ; IR's `icmp sgt %N, 0` guard -- confirm against the ARM condition codes.
    tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
    t2IT 11, 8, implicit-def $itstate
    tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
    ; Build the iteration count in $lr from $r3 (add 3, clear low two bits,
    ; subtract 4, then add to 1 with a shifted operand). NOTE(review): shifter
    ; operand 19 presumably encodes `lsr #2`, mirroring %4/%5 in the IR.
    renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
    renamable $lr = t2MOVi 1, 14, $noreg, $noreg
    renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
    renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
    renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
    t2DoLoopStart renamable $lr

  bb.1.vector.body:
    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
    liveins: $lr, $r0, $r1, $r2, $r3

    ; $vpr is the predicate computed from the element count in $r3; it guards
    ; the two post-incrementing loads (via $r1, $r2) and the store (via $r0).
    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
    MVE_VPST 4, implicit $vpr
    renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3)
    renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3)
    ; First update of $r3. Per the comment at the top of the file, the pass
    ; treats this one as a use of $r3 because the second update is taken as
    ; the def, so it must not be removed.
    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
    renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
    MVE_VPST 8, implicit $vpr
    renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3)
    renamable $lr = t2LoopDec killed renamable $lr, 1
    ; Second update of $r3, the one identified as the def; it is also expected
    ; to remain in the output.
    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
    ; Loop back-edge pseudo on $lr, followed by the explicit branch to the exit.
    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
    tB %bb.2, 14, $noreg

  bb.2.for.cond.cleanup:
    ; Epilogue: restore $r7 and return by popping into $pc.
    tPOP_RET 14, $noreg, def $r7, def $pc

...
[ARM][LowOverheadLoops] Remove dead loop update instructions.

After creating a low-overhead loop, the loop update instruction was still lingering around, hurting performance. This removes dead loop update instructions, which in our case are mostly SUBS instructions.

To support this, some helper functions were added to MachineLoopUtils and ReachingDefAnalysis to analyse live-ins of loop exit blocks and find uses before a particular loop instruction, respectively.

This is a first version that removes a SUBS instruction when there are no other uses inside and outside the loop block, but there are some more interesting cases in test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll which show that there is room for improvement. For example, we can't handle this case yet:

    ..
    dlstp.32 lr, r2
    .LBB0_1:
    mov r3, r2
    subs r2, #4
    vldrh.u32 q2, [r1], #8
    vmov q1, q0
    vmla.u32 q0, q2, r0
    letp lr, .LBB0_1
    @ %bb.2:
    vctp.32 r3
    ..

which is a lot more tricky because r2 is not only used by the subs, but also by the mov to r3, which is used outside the low-overhead loop by the vctp instruction. That requires a bit of a different approach, and I will follow up on this.

Differential Revision: https://reviews.llvm.org/D71007

2019-12-11 18:11:48 +08:00			`# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - \| FileCheck %s`

			`# There are 2 SUBS, and the 2nd one is identified as the def.`
			`# Thus, the 1st is a use, and we shouldn't optimise away the SUBS.`

			`# CHECK: bb.1.vector.body:`
			`# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg`
			`# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg`
			`# CHECK: $lr = MVE_LETP renamable $lr, %bb.1`

			`--- \|`
			`target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"`
			`target triple = "thumbv8.1m.main-arm-unknown-eabi"`

			`define dso_local void @use_before_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {`
			`entry:`
			`%cmp8 = icmp sgt i32 %N, 0`
			`%0 = add i32 %N, 3`
			`%1 = lshr i32 %0, 2`
			`%2 = shl nuw i32 %1, 2`
			`%3 = add i32 %2, -4`
			`%4 = lshr i32 %3, 2`
			`%5 = add nuw nsw i32 %4, 1`
			`br i1 %cmp8, label %vector.ph, label %for.cond.cleanup`

			`vector.ph: ; preds = %entry`
			`call void @llvm.set.loop.iterations.i32(i32 %5)`
			`br label %vector.body`

			`vector.body: ; preds = %vector.body, %vector.ph`
			`%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]`
			`%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]`
			`%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]`
			`%6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]`
			`%7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]`
			`%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*`
			`%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*`
			`%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*`
			`%8 = call <4 x i1> @llvm.arm.vctp32(i32 %7)`
			`%9 = sub i32 %7, 4`
			`%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3`
			`%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3`
			`%10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load`
			`call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3`
			`%scevgep = getelementptr i32, i32* %lsr.iv, i32 4`
			`%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4`
			`%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4`
			`%11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)`
			`%12 = icmp ne i32 %11, 0`
			`br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7`

			`for.cond.cleanup: ; preds = %vector.body, %entry`
			`ret void`
			`}`
			`declare void @llvm.set.loop.iterations.i32(i32) #1`
			`declare <4 x i1> @llvm.arm.vctp32(i32) #2`
			`declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1`
			`declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3`
			`declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4`
			`declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3`
			`declare void @llvm.stackprotector(i8*, i8**) #5`

			attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" }
			`attributes #1 = { noduplicate nounwind }`
			`attributes #2 = { nounwind readnone }`
			`attributes #3 = { argmemonly nounwind willreturn }`
			`attributes #4 = { argmemonly nounwind readonly willreturn }`
			`attributes #5 = { nounwind }`

			`!llvm.module.flags = !{!0, !1}`
			`!llvm.ident = !{!2}`

			`!0 = !{i32 1, !"wchar_size", i32 4}`
			`!1 = !{i32 1, !"min_enum_size", i32 4}`
			`!2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"}`
			`!3 = !{!4, !4, i64 0}`
			`!4 = !{!"int", !5, i64 0}`
			`!5 = !{!"omnipotent char", !6, i64 0}`
			`!6 = !{!"Simple C++ TBAA"}`
			`!7 = distinct !{!7, !8}`
			`!8 = !{!"llvm.loop.isvectorized", i32 1}`

			`...`
			`---`
			`name: use_before_def`
			`alignment: 2`
			`exposesReturnsTwice: false`
			`legalized: false`
			`regBankSelected: false`
			`selected: false`
			`failedISel: false`
			`tracksRegLiveness: true`
			`hasWinCFI: false`
			`registers: []`
			`liveins:`
			`- { reg: '$r0', virtual-reg: '' }`
			`- { reg: '$r1', virtual-reg: '' }`
			`- { reg: '$r2', virtual-reg: '' }`
			`- { reg: '$r3', virtual-reg: '' }`
			`frameInfo:`
			`isFrameAddressTaken: false`
			`isReturnAddressTaken: false`
			`hasStackMap: false`
			`hasPatchPoint: false`
			`stackSize: 8`
			`offsetAdjustment: 0`
			`maxAlignment: 4`
			`adjustsStack: false`
			`hasCalls: false`
			`stackProtector: ''`
			`maxCallFrameSize: 0`
			`cvBytesOfCalleeSavedRegisters: 0`
			`hasOpaqueSPAdjustment: false`
			`hasVAStart: false`
			`hasMustTailInVarArgFunc: false`
			`localFrameSize: 0`
			`savePoint: ''`
			`restorePoint: ''`
			`fixedStack: []`
			`stack:`
			`- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,`
			`stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,`
			`debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }`
			`- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,`
			`stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,`
			`debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }`
			`callSites: []`
			`constants: []`
			`machineFunctionInfo: {}`
			`body: \|`
			`bb.0.entry:`
			`successors: %bb.1(0x80000000)`
			`liveins: $r0, $r1, $r2, $r3, $lr`

			`frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp`
			`frame-setup CFI_INSTRUCTION def_cfa_offset 8`
			`frame-setup CFI_INSTRUCTION offset $lr, -4`
			`frame-setup CFI_INSTRUCTION offset $r7, -8`
			`$r7 = frame-setup tMOVr $sp, 14, $noreg`
			`frame-setup CFI_INSTRUCTION def_cfa_register $r7`
			`tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr`
			`t2IT 11, 8, implicit-def $itstate`
			`tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate`
			`renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg`
			`renamable $lr = t2MOVi 1, 14, $noreg, $noreg`
			`renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg`
			`renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg`
			`renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg`
			`t2DoLoopStart renamable $lr`

			`bb.1.vector.body:`
			`successors: %bb.1(0x7c000000), %bb.2(0x04000000)`
			`liveins: $lr, $r0, $r1, $r2, $r3`

			`renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg`
			`MVE_VPST 4, implicit $vpr`
			`renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3)`
			`renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3)`
			`renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg`
			`renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0`
			`MVE_VPST 8, implicit $vpr`
			`renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3)`
			`renamable $lr = t2LoopDec killed renamable $lr, 1`
			`renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg`
			`t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr`
			`tB %bb.2, 14, $noreg`

			`bb.2.for.cond.cleanup:`
			`tPOP_RET 14, $noreg, def $r7, def $pc`

			`...`