[ARM][MVE] Disallow VPSEL for tail predication

Due to the current way that we collect predicated instructions, we
can't easily handle vpsel in tail predicated loops. There are a
couple of issues:
1) It will use the VPR as a predicate operand, but doesn't have to be
   inside a VPT block, which means we can hit an assertion while building
   up the VPT block because we don't find another VPST to begin a new
   one.
2) VPSEL still requires a VPR operand even after tail predicating,
   which means we can't remove it unless there is another
   instruction, such as vcmp, that can provide the VPR def.

The first issue should be a relatively simple fix in the logic of the
LowOverheadLoops pass, whereas the second will require us to
represent the 'implicit' tail predication with an explicit value.

Differential Revision: https://reviews.llvm.org/D72629
Sam Parker 2020-01-14 11:02:32 +00:00
parent d6ea8ff0d7
commit e73b20c57d
7 changed files with 1199 additions and 4 deletions
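
To make the second issue concrete, here is a minimal, self-contained C++ sketch. It is not the pass's actual code: Instruction, DefsVPR, UsesVPR, Removable and safeToRemoveVCTP are invented stand-ins for the MachineInstr queries that LowOverheadLoops performs. Tail predicating removes the vctp that defines VPR, so every remaining explicit VPR use either has to be droppable or needs another def (such as a vcmp) to cover it; a vpsel is neither, which is why the loops in the tests below are not tail predicated.

// Sketch (not LLVM code) of why a vpsel blocks tail predication.
#include <cstdio>
#include <vector>

struct Instruction {
  const char *Name;
  bool DefsVPR;   // writes the predicate register, e.g. a vcmp
  bool UsesVPR;   // reads it, e.g. predicated loads/stores or vpsel
  bool Removable; // the explicit VPR use can be dropped when tail predicating
};

// Tail predication deletes the vctp that defines VPR, so every remaining
// VPR use that cannot simply be dropped needs some other def to cover it.
bool safeToRemoveVCTP(const std::vector<Instruction> &Loop) {
  bool HaveOtherDef = false;
  for (const Instruction &I : Loop) {
    if (I.DefsVPR)
      HaveOtherDef = true;            // e.g. a vcmp could provide the def
    else if (I.UsesVPR && !I.Removable && !HaveOtherDef)
      return false;                   // a vpsel would read an undefined VPR
  }
  return true;
}

int main() {
  std::vector<Instruction> Loop = {
    // The vctp itself is omitted: it is the def we intend to delete.
    {"vldrh (predicated)", false, true, true},
    {"vpsel",              false, true, false},
  };
  std::printf("safe to tail predicate: %d\n", safeToRemoveVCTP(Loop)); // 0
  return 0;
}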

@@ -5712,7 +5712,6 @@ def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
let validForTailPredication = 1;
}
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32",

@@ -485,6 +485,9 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
}
bool LowOverheadLoop::RecordVPTBlocks(MachineInstr* MI) {
if (CannotTailPredicate)
return false;
// Only support a single vctp.
if (isVCTP(MI) && VCTP)
return false;
@@ -494,10 +497,20 @@ bool LowOverheadLoop::RecordVPTBlocks(MachineInstr* MI) {
VPTBlocks.emplace_back(MI, CurrentPredicate);
CurrentBlock = &VPTBlocks.back();
return true;
}
if (isVCTP(MI))
} else if (isVCTP(MI))
VCTP = MI;
else if (MI->getOpcode() == ARM::MVE_VPSEL ||
MI->getOpcode() == ARM::MVE_VPNOT)
return false;
// TODO: Allow VPSEL and VPNOT; we currently cannot because:
// 1) It will use the VPR as a predicate operand, but doesn't have to be
// inside a VPT block, which means we can hit an assertion while building
// up the VPT block because we don't find another VPST to begin a new
// one.
// 2) VPSEL still requires a VPR operand even after tail predicating,
// which means we can't remove it unless there is another
// instruction, such as vcmp, that can provide the VPR def.
unsigned VPROpNum = MI->getNumOperands() - 1;
bool IsUse = false;
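
For reference, the guard above reduces to a plain opcode check that runs before any other predication bookkeeping. The snippet below is a simplified, compilable model of that behaviour, with the surrounding pass state abstracted away; Opcode, Instr and recordInstruction are invented names, not LLVM's.

// Simplified model of the early exit added to RecordVPTBlocks.
#include <cstdio>
#include <vector>

enum class Opcode { VCTP, VPST, VPSEL, VPNOT, VLDR, VSTR, Other };

struct Instr {
  Opcode Op;
  bool Predicated; // reads VPR as a predicate operand
};

// Returns false (loop cannot be tail predicated) for VPSEL/VPNOT,
// mirroring the opcode check in the diff above; otherwise predicated
// instructions are recorded against the current block.
bool recordInstruction(const Instr &MI, std::vector<const Instr *> &Block) {
  if (MI.Op == Opcode::VPSEL || MI.Op == Opcode::VPNOT)
    return false; // see the TODO comment above for the two reasons
  if (MI.Predicated)
    Block.push_back(&MI);
  return true;
}

int main() {
  std::vector<const Instr *> Block;
  Instr Load{Opcode::VLDR, true};
  Instr Sel{Opcode::VPSEL, true};
  std::printf("vldr accepted: %d\n", recordInstruction(Load, Block)); // 1
  std::printf("vpsel accepted: %d\n", recordInstruction(Sel, Block)); // 0
  return 0;
}

When the check fires, the loop is still converted to a low-overhead loop (t2DLS/t2LEUpdate in the tests below), just not a tail predicated one.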

@@ -0,0 +1,235 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s
# Test that VPNOTs cannot be within a tail predicated loop.
--- |
define dso_local void @inloop_vpnot(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32* nocapture %e, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
%tmp1 = lshr i32 %tmp, 2
%tmp2 = shl nuw i32 %tmp1, 2
%tmp3 = add i32 %tmp2, -4
%tmp4 = lshr i32 %tmp3, 2
%tmp5 = add nuw nsw i32 %tmp4, 1
br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
call void @llvm.set.loop.iterations.i32(i32 %tmp5)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
%lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
%lsr.cast.e = bitcast i32* %lsr.iv.e to <4 x i32>*
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
%tmp13 = add <4 x i32> %tmp12, %mul.2
%tmp14 = add <4 x i32> %tmp13, %vec.phi
%vpnot = xor <4 x i1> %tmp8, <i1 true, i1 true, i1 true, i1 true>
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp14, <4 x i32>* %lsr.cast.e, i32 4, <4 x i1> %vpnot)
%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
%scevgep.e = getelementptr i32, i32* %lsr.iv.e, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
br i1 %tmp16, label %vector.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
declare void @llvm.set.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
...
---
name: inloop_vpnot
alignment: 2
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers: []
liveins:
- { reg: '$r0', virtual-reg: '' }
- { reg: '$r1', virtual-reg: '' }
- { reg: '$r2', virtual-reg: '' }
- { reg: '$r3', virtual-reg: '' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 16
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 0
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack:
- { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, type: default, offset: 0, size: 4, alignment: 8, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
stack:
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
constants: []
machineFunctionInfo: {}
body: |
; CHECK-LABEL: name: inloop_vpnot
; CHECK: bb.0.entry:
; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16
; CHECK: renamable $r12 = t2LDRi12 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.1)
; CHECK: t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
; CHECK: tBcc %bb.3, 0, killed $cpsr
; CHECK: bb.1.vector.ph:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r12
; CHECK: renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
; CHECK: renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
; CHECK: renamable $r5 = tLDRspi $sp, 4, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
; CHECK: renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
; CHECK: $lr = t2DLS renamable $lr
; CHECK: $r4 = tMOVr killed $lr, 14, $noreg
; CHECK: bb.2.vector.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
; CHECK: liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r12
; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
; CHECK: renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $r0, renamable $q2 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
; CHECK: renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
; CHECK: renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
; CHECK: $lr = tMOVr $r4, 14, $noreg
; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
; CHECK: renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
; CHECK: MVE_VPST 8, implicit $vpr
; CHECK: renamable $r5 = MVE_VSTRWU32_post renamable $q0, killed renamable $r5, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.cast.e, align 4)
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
; CHECK: bb.3.for.cond.cleanup:
; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc
bb.0.entry:
successors: %bb.3(0x30000000), %bb.1(0x50000000)
liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 16
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
frame-setup CFI_INSTRUCTION offset $r5, -12
frame-setup CFI_INSTRUCTION offset $r4, -16
renamable $r12 = t2LDRi12 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.0)
t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
tBcc %bb.3, 0, killed $cpsr
bb.1.vector.ph:
successors: %bb.2(0x80000000)
liveins: $r0, $r1, $r2, $r3, $r12
renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
renamable $r5 = tLDRspi $sp, 4, 14, $noreg :: (load 4 from %fixed-stack.1, align 8)
renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
t2DoLoopStart renamable $lr
$r4 = tMOVr killed $lr, 14, $noreg
bb.2.vector.body:
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r12
renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
MVE_VPST 4, implicit $vpr
renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
MVE_VPST 4, implicit $vpr
renamable $r0, renamable $q2 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
$lr = tMOVr $r4, 14, $noreg
renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
MVE_VPST 8, implicit $vpr
renamable $r5 = MVE_VSTRWU32_post renamable $q0, killed renamable $r5, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.cast.e, align 4)
renamable $lr = t2LoopDec killed renamable $lr, 1
t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr
tB %bb.3, 14, $noreg
bb.3.for.cond.cleanup:
tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc
...

@@ -0,0 +1,235 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s
# Test that a predicated VPNOT cannot be in a tail predicated loop.
--- |
define dso_local void @inloop_vpnot(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32* nocapture %e, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
%tmp1 = lshr i32 %tmp, 2
%tmp2 = shl nuw i32 %tmp1, 2
%tmp3 = add i32 %tmp2, -4
%tmp4 = lshr i32 %tmp3, 2
%tmp5 = add nuw nsw i32 %tmp4, 1
br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
call void @llvm.set.loop.iterations.i32(i32 %tmp5)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
%lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
%lsr.cast.e = bitcast i32* %lsr.iv.e to <4 x i32>*
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
%tmp13 = add <4 x i32> %tmp12, %mul.2
%tmp14 = add <4 x i32> %tmp13, %vec.phi
%vpnot = xor <4 x i1> %tmp8, <i1 true, i1 true, i1 true, i1 true>
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp14, <4 x i32>* %lsr.cast.e, i32 4, <4 x i1> %vpnot)
%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
%scevgep.e = getelementptr i32, i32* %lsr.iv.e, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
br i1 %tmp16, label %vector.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
declare void @llvm.set.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
...
---
name: inloop_vpnot
alignment: 2
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers: []
liveins:
- { reg: '$r0', virtual-reg: '' }
- { reg: '$r1', virtual-reg: '' }
- { reg: '$r2', virtual-reg: '' }
- { reg: '$r3', virtual-reg: '' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 16
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 0
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack:
- { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, type: default, offset: 0, size: 4, alignment: 8, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
stack:
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
constants: []
machineFunctionInfo: {}
body: |
; CHECK-LABEL: name: inloop_vpnot
; CHECK: bb.0.entry:
; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16
; CHECK: renamable $r12 = t2LDRi12 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.1)
; CHECK: t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
; CHECK: tBcc %bb.3, 0, killed $cpsr
; CHECK: bb.1.vector.ph:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r12
; CHECK: renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
; CHECK: renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
; CHECK: renamable $r5 = tLDRspi $sp, 4, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
; CHECK: renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
; CHECK: $lr = t2DLS renamable $lr
; CHECK: $r4 = tMOVr killed $lr, 14, $noreg
; CHECK: bb.2.vector.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
; CHECK: liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r12
; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
; CHECK: renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $r0, renamable $q2 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
; CHECK: renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
; CHECK: renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
; CHECK: $lr = tMOVr $r4, 14, $noreg
; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
; CHECK: renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $vpr = MVE_VPNOT renamable $vpr, 0, killed renamable $vpr
; CHECK: renamable $r5 = MVE_VSTRWU32_post renamable $q0, killed renamable $r5, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.cast.e, align 4)
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
; CHECK: bb.3.for.cond.cleanup:
; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc
bb.0.entry:
successors: %bb.3(0x30000000), %bb.1(0x50000000)
liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 16
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
frame-setup CFI_INSTRUCTION offset $r5, -12
frame-setup CFI_INSTRUCTION offset $r4, -16
renamable $r12 = t2LDRi12 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.0)
t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
tBcc %bb.3, 0, killed $cpsr
bb.1.vector.ph:
successors: %bb.2(0x80000000)
liveins: $r0, $r1, $r2, $r3, $r12
renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
renamable $r5 = tLDRspi $sp, 4, 14, $noreg :: (load 4 from %fixed-stack.1, align 8)
renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
t2DoLoopStart renamable $lr
$r4 = tMOVr killed $lr, 14, $noreg
bb.2.vector.body:
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r12
renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
MVE_VPST 4, implicit $vpr
renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
MVE_VPST 4, implicit $vpr
renamable $r0, renamable $q2 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
$lr = tMOVr $r4, 14, $noreg
renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
MVE_VPST 4, implicit $vpr
renamable $vpr = MVE_VPNOT renamable $vpr, 0, killed renamable $vpr
renamable $r5 = MVE_VSTRWU32_post renamable $q0, killed renamable $r5, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.cast.e, align 4)
renamable $lr = t2LoopDec killed renamable $lr, 1
t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr
tB %bb.3, 14, $noreg
bb.3.for.cond.cleanup:
tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc
...

@@ -0,0 +1,235 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s
# Test that a VPNOT is not added to a max sized VPT block.
--- |
define dso_local void @inloop_vpnot(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32* nocapture %e, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
%tmp1 = lshr i32 %tmp, 2
%tmp2 = shl nuw i32 %tmp1, 2
%tmp3 = add i32 %tmp2, -4
%tmp4 = lshr i32 %tmp3, 2
%tmp5 = add nuw nsw i32 %tmp4, 1
br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
call void @llvm.set.loop.iterations.i32(i32 %tmp5)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
%lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
%lsr.cast.e = bitcast i32* %lsr.iv.e to <4 x i32>*
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
%tmp13 = add <4 x i32> %tmp12, %mul.2
%tmp14 = add <4 x i32> %tmp13, %vec.phi
%vpnot = xor <4 x i1> %tmp8, <i1 true, i1 true, i1 true, i1 true>
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp14, <4 x i32>* %lsr.cast.e, i32 4, <4 x i1> %vpnot)
%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
%scevgep.e = getelementptr i32, i32* %lsr.iv.e, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
br i1 %tmp16, label %vector.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
declare void @llvm.set.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
...
---
name: inloop_vpnot
alignment: 2
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers: []
liveins:
- { reg: '$r0', virtual-reg: '' }
- { reg: '$r1', virtual-reg: '' }
- { reg: '$r2', virtual-reg: '' }
- { reg: '$r3', virtual-reg: '' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 16
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 0
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack:
- { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, type: default, offset: 0, size: 4, alignment: 8, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
stack:
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
constants: []
machineFunctionInfo: {}
body: |
; CHECK-LABEL: name: inloop_vpnot
; CHECK: bb.0.entry:
; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16
; CHECK: renamable $r12 = t2LDRi12 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.1)
; CHECK: t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
; CHECK: tBcc %bb.3, 0, killed $cpsr
; CHECK: bb.1.vector.ph:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r12
; CHECK: renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
; CHECK: renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
; CHECK: renamable $r5 = tLDRspi $sp, 4, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
; CHECK: renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
; CHECK: $lr = t2DLS renamable $lr
; CHECK: $r4 = tMOVr killed $lr, 14, $noreg
; CHECK: bb.2.vector.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
; CHECK: liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r12
; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
; CHECK: renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $r0, renamable $q2 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
; CHECK: renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
; CHECK: $lr = tMOVr $r4, 14, $noreg
; CHECK: renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
; CHECK: MVE_VPST 2, implicit $vpr
; CHECK: renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, renamable $vpr, undef renamable $q1
; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, renamable $vpr, undef renamable $q0
; CHECK: renamable $r5 = MVE_VSTRWU32_post renamable $q0, killed renamable $r5, 16, 1, renamable $vpr :: (store 16 into %ir.lsr.cast.e, align 4)
; CHECK: renamable $vpr = MVE_VPNOT renamable $vpr, 0, $noreg
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
; CHECK: bb.3.for.cond.cleanup:
; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc
bb.0.entry:
successors: %bb.3(0x30000000), %bb.1(0x50000000)
liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 16
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
frame-setup CFI_INSTRUCTION offset $r5, -12
frame-setup CFI_INSTRUCTION offset $r4, -16
renamable $r12 = t2LDRi12 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.0)
t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
tBcc %bb.3, 0, killed $cpsr
bb.1.vector.ph:
successors: %bb.2(0x80000000)
liveins: $r0, $r1, $r2, $r3, $r12
renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
renamable $r5 = tLDRspi $sp, 4, 14, $noreg :: (load 4 from %fixed-stack.1, align 8)
renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
t2DoLoopStart renamable $lr
$r4 = tMOVr killed $lr, 14, $noreg
bb.2.vector.body:
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r12
renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
MVE_VPST 4, implicit $vpr
renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
MVE_VPST 4, implicit $vpr
renamable $r0, renamable $q2 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
$lr = tMOVr $r4, 14, $noreg
renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
MVE_VPST 2, implicit $vpr
renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, renamable $vpr, undef renamable $q1
renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, renamable $vpr, undef renamable $q0
renamable $r5 = MVE_VSTRWU32_post renamable $q0, killed renamable $r5, 16, 1, renamable $vpr :: (store 16 into %ir.lsr.cast.e, align 4)
renamable $vpr = MVE_VPNOT renamable $vpr, 0, $noreg
renamable $lr = t2LoopDec killed renamable $lr, 1
t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr
tB %bb.3, 14, $noreg
bb.3.for.cond.cleanup:
tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc
...

@@ -0,0 +1,239 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s
# General test for vpsel exclusion from tail predication
--- |
define dso_local i32 @vpsel_after_vpt(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
%tmp1 = lshr i32 %tmp, 2
%tmp2 = shl nuw i32 %tmp1, 2
%tmp3 = add i32 %tmp2, -4
%tmp4 = lshr i32 %tmp3, 2
%tmp5 = add nuw nsw i32 %tmp4, 1
br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
call void @llvm.set.loop.iterations.i32(i32 %tmp5)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
%tmp13 = add <4 x i32> %tmp12, %mul.2
%acc = add <4 x i32> %tmp13, %vec.phi
%tmp14 = select <4 x i1> %tmp8, <4 x i32> %acc, <4 x i32> %vec.phi
%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
br i1 %tmp16, label %vector.body, label %middle.block
middle.block: ; preds = %vector.body
%tmp17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp14)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp17, %middle.block ]
ret i32 %res.0.lcssa
}
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2
declare void @llvm.set.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
...
---
name: vpsel_after_vpt
alignment: 2
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers: []
liveins:
- { reg: '$r0', virtual-reg: '' }
- { reg: '$r1', virtual-reg: '' }
- { reg: '$r2', virtual-reg: '' }
- { reg: '$r3', virtual-reg: '' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 16
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 0
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack:
- { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
stack:
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
constants: []
machineFunctionInfo: {}
body: |
; CHECK-LABEL: name: vpsel_after_vpt
; CHECK: bb.0.entry:
; CHECK: successors: %bb.4(0x30000000), %bb.1(0x50000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16
; CHECK: renamable $r12 = t2LDRi12 $sp, 16, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
; CHECK: t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
; CHECK: tBcc %bb.4, 0, killed $cpsr
; CHECK: bb.1.vector.ph:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r12
; CHECK: renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
; CHECK: renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
; CHECK: renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
; CHECK: renamable $r5 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
; CHECK: $lr = t2DLS renamable $r5
; CHECK: $r4 = tMOVr killed $r5, 14, $noreg
; CHECK: bb.2.vector.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
; CHECK: liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r12
; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
; CHECK: renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $r0, renamable $q2 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
; CHECK: renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
; CHECK: renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
; CHECK: $lr = tMOVr $r4, 14, $noreg
; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
; CHECK: renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1
; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
; CHECK: bb.3.middle.block:
; CHECK: liveins: $q0
; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg
; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0
; CHECK: bb.4:
; CHECK: renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg
; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0
bb.0.entry:
successors: %bb.4(0x30000000), %bb.1(0x50000000)
liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 16
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
frame-setup CFI_INSTRUCTION offset $r5, -12
frame-setup CFI_INSTRUCTION offset $r4, -16
renamable $r12 = t2LDRi12 $sp, 16, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
tBcc %bb.4, 0, killed $cpsr
bb.1.vector.ph:
successors: %bb.2(0x80000000)
liveins: $r0, $r1, $r2, $r3, $r12
renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
renamable $r5 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
t2DoLoopStart renamable $r5
$r4 = tMOVr killed $r5, 14, $noreg
bb.2.vector.body:
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r12
renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
MVE_VPST 4, implicit $vpr
renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
MVE_VPST 4, implicit $vpr
renamable $r0, renamable $q2 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
$lr = tMOVr $r4, 14, $noreg
renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1
renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr
renamable $lr = t2LoopDec killed renamable $lr, 1
t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr
tB %bb.3, 14, $noreg
bb.3.middle.block:
liveins: $q0
renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg
tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0
bb.4:
renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg
tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0
...

@@ -0,0 +1,239 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s
# Test that, even if vpsels are allowed, they aren't inserted into a vpt
# block - which would cause an assert here because of the number of insts in
# the block.
--- |
define dso_local i32 @vpsel_after_vpt(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
%tmp1 = lshr i32 %tmp, 2
%tmp2 = shl nuw i32 %tmp1, 2
%tmp3 = add i32 %tmp2, -4
%tmp4 = lshr i32 %tmp3, 2
%tmp5 = add nuw nsw i32 %tmp4, 1
br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
call void @llvm.set.loop.iterations.i32(i32 %tmp5)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
%tmp13 = add <4 x i32> %tmp12, %mul.2
%acc = add <4 x i32> %tmp13, %vec.phi
%tmp14 = select <4 x i1> %tmp8, <4 x i32> %acc, <4 x i32> %vec.phi
%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
br i1 %tmp16, label %vector.body, label %middle.block
middle.block: ; preds = %vector.body
%tmp17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp14)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp17, %middle.block ]
ret i32 %res.0.lcssa
}
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2
declare void @llvm.set.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
...
---
name: vpsel_after_vpt
alignment: 2
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers: []
liveins:
- { reg: '$r0', virtual-reg: '' }
- { reg: '$r1', virtual-reg: '' }
- { reg: '$r2', virtual-reg: '' }
- { reg: '$r3', virtual-reg: '' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 16
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 0
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack:
- { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
stack:
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
constants: []
machineFunctionInfo: {}
body: |
; CHECK-LABEL: name: vpsel_after_vpt
; CHECK: bb.0.entry:
; CHECK: successors: %bb.4(0x30000000), %bb.1(0x50000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16
; CHECK: renamable $r12 = t2LDRi12 $sp, 16, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
; CHECK: t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
; CHECK: tBcc %bb.4, 0, killed $cpsr
; CHECK: bb.1.vector.ph:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r12
; CHECK: renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
; CHECK: renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
; CHECK: renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
; CHECK: renamable $r5 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
; CHECK: $lr = t2DLS renamable $r5
; CHECK: $r4 = tMOVr killed $r5, 14, $noreg
; CHECK: bb.2.vector.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
; CHECK: liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r12
; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
; CHECK: MVE_VPST 2, implicit $vpr
; CHECK: renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
; CHECK: renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
; CHECK: renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
; CHECK: renamable $r0, renamable $q4 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
; CHECK: renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q4, 0, $noreg, undef renamable $q2
; CHECK: $lr = tMOVr $r4, 14, $noreg
; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
; CHECK: renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1
; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
; CHECK: bb.3.middle.block:
; CHECK: liveins: $q0
; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg
; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0
; CHECK: bb.4:
; CHECK: renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg
; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0
bb.0.entry:
successors: %bb.4(0x30000000), %bb.1(0x50000000)
liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 16
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
frame-setup CFI_INSTRUCTION offset $r5, -12
frame-setup CFI_INSTRUCTION offset $r4, -16
renamable $r12 = t2LDRi12 $sp, 16, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
tBcc %bb.4, 0, killed $cpsr
bb.1.vector.ph:
successors: %bb.2(0x80000000)
liveins: $r0, $r1, $r2, $r3, $r12
renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
renamable $r5 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
t2DoLoopStart renamable $r5
$r4 = tMOVr killed $r5, 14, $noreg
bb.2.vector.body:
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r12
renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
MVE_VPST 2, implicit $vpr
renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
renamable $r0, renamable $q4 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q4, 0, $noreg, undef renamable $q2
$lr = tMOVr $r4, 14, $noreg
renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1
renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr
renamable $lr = t2LoopDec killed renamable $lr, 1
t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr
tB %bb.3, 14, $noreg
bb.3.middle.block:
liveins: $q0
renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg
tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0
bb.4:
renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg
tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0
...