forked from OSchip/llvm-project
[ARM][AArch64] Add additional test for multiuse vldn binop shuffles. NFC
For D129419: these are the same as the existing tests, but run through -early-cse.
This commit is contained in:
parent
109d7fb4e6
commit
6ce63e267a
|
@ -1,5 +1,5 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=aarch64-none-eabif | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
|
||||
|
||||
define void @vld2(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
|
||||
; CHECK-LABEL: vld2:
|
||||
|
@ -181,3 +181,164 @@ vector.body: ; preds = %vector.body, %entry
|
|||
while.end: ; preds = %vector.body
|
||||
ret void
|
||||
}
|
||||
|
||||
; vld2_multiuse: stride-2 de-interleave where a single fmul has multiple users.
; Each iteration loads <8 x float> from %pSrc + 2*%index, squares it once, splits
; the product into its even and odd lanes, adds the two halves and stores the
; <4 x float> sum to %pDst + %index. %index advances by 4 until it equals 1024;
; %numSamples is not referenced by the body.
; The autogenerated assertions below record an ldp load followed by uzp1/uzp2.
define void @vld2_multiuse(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
|
||||
; CHECK-LABEL: vld2_multiuse:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: mov x8, xzr
|
||||
; CHECK-NEXT: .LBB4_1: // %vector.body
|
||||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: ldp q1, q0, [x0], #32
|
||||
; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
|
||||
; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
|
||||
; CHECK-NEXT: uzp1 v2.4s, v1.4s, v0.4s
|
||||
; CHECK-NEXT: uzp2 v0.4s, v1.4s, v0.4s
|
||||
; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
|
||||
; CHECK-NEXT: str q0, [x1, x8]
|
||||
; CHECK-NEXT: add x8, x8, #16
|
||||
; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
|
||||
; CHECK-NEXT: b.ne .LBB4_1
|
||||
; CHECK-NEXT: // %bb.2: // %while.end
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
br label %vector.body
|
||||
|
||||
|
||||
vector.body: ; preds = %vector.body, %entry
|
||||
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
|
||||
; Source offset is 2 * index because two input floats feed each output float.
%0 = shl i64 %index, 1
|
||||
%next.gep = getelementptr float, float* %pSrc, i64 %0
|
||||
%next.gep19 = getelementptr float, float* %pDst, i64 %index
|
||||
%1 = bitcast float* %next.gep to <8 x float>*
|
||||
%wide.vec = load <8 x float>, <8 x float>* %1, align 4
|
||||
; One wide multiply whose result is consumed by both shuffles below.
%2 = fmul fast <8 x float> %wide.vec, %wide.vec
|
||||
; Even lanes (0, 2, 4, 6) of the squared vector.
%3 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
; Odd lanes (1, 3, 5, 7) of the squared vector.
%4 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
||||
%5 = fadd fast <4 x float> %4, %3
|
||||
%6 = bitcast float* %next.gep19 to <4 x float>*
|
||||
store <4 x float> %5, <4 x float>* %6, align 4
|
||||
%index.next = add i64 %index, 4
|
||||
; Fixed trip count: exit once index reaches 1024.
%7 = icmp eq i64 %index.next, 1024
|
||||
br i1 %7, label %while.end, label %vector.body
|
||||
|
||||
|
||||
while.end: ; preds = %vector.body
|
||||
ret void
|
||||
}
|
||||
|
||||
; vld3_multiuse: stride-3 de-interleave where a single fmul has multiple users.
; Each iteration loads <12 x float> from %pSrc + 3*%index, squares it once, pulls
; out the three stride-3 lane groups, sums them and stores the <4 x float> result
; to %pDst + %index. %index advances by 4 until it equals 1024; %numSamples is
; not referenced by the body.
; The autogenerated assertions below record ldp/ldr loads combined with
; per-lane mov and rev64 instructions.
define void @vld3_multiuse(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
|
||||
; CHECK-LABEL: vld3_multiuse:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: mov x8, xzr
|
||||
; CHECK-NEXT: .LBB5_1: // %vector.body
|
||||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
|
||||
; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
|
||||
; CHECK-NEXT: ldr q3, [x0, #32]
|
||||
; CHECK-NEXT: add x0, x0, #48
|
||||
; CHECK-NEXT: mov v2.16b, v0.16b
|
||||
; CHECK-NEXT: mov v2.s[1], v0.s[3]
|
||||
; CHECK-NEXT: rev64 v4.4s, v1.4s
|
||||
; CHECK-NEXT: fmul v3.4s, v3.4s, v3.4s
|
||||
; CHECK-NEXT: mov v2.s[2], v1.s[2]
|
||||
; CHECK-NEXT: mov v4.s[0], v0.s[1]
|
||||
; CHECK-NEXT: mov v1.s[0], v0.s[2]
|
||||
; CHECK-NEXT: mov v2.s[3], v3.s[1]
|
||||
; CHECK-NEXT: mov v4.s[3], v3.s[2]
|
||||
; CHECK-NEXT: mov v1.s[2], v3.s[0]
|
||||
; CHECK-NEXT: fadd v0.4s, v4.4s, v2.4s
|
||||
; CHECK-NEXT: mov v1.s[3], v3.s[3]
|
||||
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: str q0, [x1, x8]
|
||||
; CHECK-NEXT: add x8, x8, #16
|
||||
; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
|
||||
; CHECK-NEXT: b.ne .LBB5_1
|
||||
; CHECK-NEXT: // %bb.2: // %while.end
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
br label %vector.body
|
||||
|
||||
|
||||
vector.body: ; preds = %vector.body, %entry
|
||||
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
|
||||
; Source offset is 3 * index because three input floats feed each output float.
%0 = mul i64 %index, 3
|
||||
%next.gep = getelementptr float, float* %pSrc, i64 %0
|
||||
%next.gep23 = getelementptr float, float* %pDst, i64 %index
|
||||
%1 = bitcast float* %next.gep to <12 x float>*
|
||||
%wide.vec = load <12 x float>, <12 x float>* %1, align 4
|
||||
; One wide multiply whose result is consumed by all three shuffles below.
%2 = fmul fast <12 x float> %wide.vec, %wide.vec
|
||||
; Lanes 0, 3, 6, 9.
%3 = shufflevector <12 x float> %2, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
|
||||
; Lanes 1, 4, 7, 10.
%4 = shufflevector <12 x float> %2, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
|
||||
%5 = fadd fast <4 x float> %4, %3
|
||||
; Lanes 2, 5, 8, 11.
%6 = shufflevector <12 x float> %2, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
|
||||
%7 = fadd fast <4 x float> %5, %6
|
||||
%8 = bitcast float* %next.gep23 to <4 x float>*
|
||||
store <4 x float> %7, <4 x float>* %8, align 4
|
||||
%index.next = add i64 %index, 4
|
||||
; Fixed trip count: exit once index reaches 1024.
%9 = icmp eq i64 %index.next, 1024
|
||||
br i1 %9, label %while.end, label %vector.body
|
||||
|
||||
|
||||
while.end: ; preds = %vector.body
|
||||
ret void
|
||||
}
|
||||
|
||||
; vld4_multiuse: stride-4 de-interleave where a single fmul has multiple users,
; followed by a stride-2 interleaved store.
; Each iteration loads <16 x float> from %pSrc + 4*%index, squares it once, pulls
; out the four stride-4 lane groups, adds them pairwise (group 1 + group 0 and
; group 3 + group 2), interleaves the two sums and stores the <8 x float> result
; to %pDst + 2*%index. %index advances by 4 until it equals 1024; %numSamples is
; not referenced by the body.
; The autogenerated assertions below record two ldp loads, a zip/uzp/trn/ext
; shuffle sequence, and an st2 store.
define void @vld4_multiuse(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
|
||||
; CHECK-LABEL: vld4_multiuse:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: mov x8, xzr
|
||||
; CHECK-NEXT: .LBB6_1: // %vector.body
|
||||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: ldp q1, q0, [x0, #32]
|
||||
; CHECK-NEXT: add x9, x1, x8
|
||||
; CHECK-NEXT: add x8, x8, #32
|
||||
; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
|
||||
; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
|
||||
; CHECK-NEXT: ldp q3, q2, [x0], #64
|
||||
; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
|
||||
; CHECK-NEXT: fmul v3.4s, v3.4s, v3.4s
|
||||
; CHECK-NEXT: fmul v2.4s, v2.4s, v2.4s
|
||||
; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s
|
||||
; CHECK-NEXT: zip2 v5.4s, v1.4s, v0.4s
|
||||
; CHECK-NEXT: uzp2 v16.4s, v3.4s, v2.4s
|
||||
; CHECK-NEXT: ext v6.16b, v1.16b, v4.16b, #8
|
||||
; CHECK-NEXT: trn2 v7.4s, v3.4s, v2.4s
|
||||
; CHECK-NEXT: mov v1.s[3], v0.s[2]
|
||||
; CHECK-NEXT: zip1 v0.4s, v3.4s, v2.4s
|
||||
; CHECK-NEXT: zip2 v2.4s, v3.4s, v2.4s
|
||||
; CHECK-NEXT: uzp2 v3.4s, v16.4s, v3.4s
|
||||
; CHECK-NEXT: mov v7.d[1], v4.d[1]
|
||||
; CHECK-NEXT: mov v0.d[1], v6.d[1]
|
||||
; CHECK-NEXT: mov v2.d[1], v1.d[1]
|
||||
; CHECK-NEXT: mov v3.d[1], v5.d[1]
|
||||
; CHECK-NEXT: fadd v0.4s, v7.4s, v0.4s
|
||||
; CHECK-NEXT: fadd v1.4s, v3.4s, v2.4s
|
||||
; CHECK-NEXT: st2 { v0.4s, v1.4s }, [x9]
|
||||
; CHECK-NEXT: b.ne .LBB6_1
|
||||
; CHECK-NEXT: // %bb.2: // %while.end
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
br label %vector.body
|
||||
|
||||
|
||||
vector.body: ; preds = %vector.body, %entry
|
||||
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
|
||||
; Source offset is 4 * index.
%0 = shl i64 %index, 2
|
||||
%next.gep = getelementptr float, float* %pSrc, i64 %0
|
||||
; Destination offset is 2 * index (used by the getelementptr on %pDst below).
%1 = shl i64 %index, 1
|
||||
%2 = bitcast float* %next.gep to <16 x float>*
|
||||
%wide.vec = load <16 x float>, <16 x float>* %2, align 4
|
||||
; One wide multiply whose result is consumed by all four shuffles below.
%3 = fmul fast <16 x float> %wide.vec, %wide.vec
|
||||
; Lanes 0, 4, 8, 12.
%4 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
|
||||
; Lanes 1, 5, 9, 13.
%5 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
|
||||
%6 = fadd fast <4 x float> %5, %4
|
||||
; Lanes 2, 6, 10, 14.
%7 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
|
||||
; Lanes 3, 7, 11, 15.
%8 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
|
||||
%9 = fadd fast <4 x float> %8, %7
|
||||
%10 = getelementptr inbounds float, float* %pDst, i64 %1
|
||||
%11 = bitcast float* %10 to <8 x float>*
|
||||
; Interleave the two sums element by element: %6[0], %9[0], %6[1], %9[1], ...
%interleaved.vec = shufflevector <4 x float> %6, <4 x float> %9, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
|
||||
store <8 x float> %interleaved.vec, <8 x float>* %11, align 4
|
||||
%index.next = add i64 %index, 4
|
||||
; Fixed trip count: exit once index reaches 1024.
%12 = icmp eq i64 %index.next, 1024
|
||||
br i1 %12, label %while.end, label %vector.body
|
||||
|
||||
|
||||
while.end: ; preds = %vector.body
|
||||
ret void
|
||||
}
|
||||
|
|
|
@ -258,3 +258,289 @@ while.body: ; preds = %while.body.preheade
|
|||
while.end: ; preds = %while.body, %middle.block, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; arm_cmplx_mag_squared_f16_cse: half-precision loop with a vectorized main body
; and a scalar remainder, where the vector fmul has multiple users.
; For each of %numSamples outputs, two consecutive halves are read from %pSrc,
; each is squared, and their sum is written to %pDst (going by the function name,
; presumably interleaved real/imaginary pairs - confirm against the C source).
; The vector path handles 8 outputs per iteration and is taken only when
; %numSamples >= 8 and the runtime overlap check on the two buffers passes;
; otherwise, and for any leftover elements, the scalar loop runs.
; NOTE(review): the RUN line for this file is outside this view; the assertions
; below use '@' comments and q-register vldrh/vmovx/vins forms, so the target
; looks like Thumb2 with MVE - confirm.
define void @arm_cmplx_mag_squared_f16_cse(half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %numSamples) {
|
||||
; CHECK-LABEL: arm_cmplx_mag_squared_f16_cse:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: cmp r2, #0
|
||||
; CHECK-NEXT: beq .LBB2_8
|
||||
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
|
||||
; CHECK-NEXT: cmp r2, #8
|
||||
; CHECK-NEXT: blo .LBB2_9
|
||||
; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
|
||||
; CHECK-NEXT: add.w r3, r0, r2, lsl #2
|
||||
; CHECK-NEXT: cmp r3, r1
|
||||
; CHECK-NEXT: itt hi
|
||||
; CHECK-NEXT: addhi.w r3, r1, r2, lsl #1
|
||||
; CHECK-NEXT: cmphi r3, r0
|
||||
; CHECK-NEXT: bhi .LBB2_9
|
||||
; CHECK-NEXT: @ %bb.3: @ %vector.ph
|
||||
; CHECK-NEXT: bic r4, r2, #7
|
||||
; CHECK-NEXT: movs r5, #1
|
||||
; CHECK-NEXT: sub.w r3, r4, #8
|
||||
; CHECK-NEXT: add.w r12, r1, r4, lsl #1
|
||||
; CHECK-NEXT: add.w lr, r5, r3, lsr #3
|
||||
; CHECK-NEXT: add.w r3, r0, r4, lsl #2
|
||||
; CHECK-NEXT: and r5, r2, #7
|
||||
; CHECK-NEXT: .LBB2_4: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0], #32
|
||||
; CHECK-NEXT: vmul.f16 q0, q0, q0
|
||||
; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
|
||||
; CHECK-NEXT: vmovx.f16 s4, s0
|
||||
; CHECK-NEXT: vmovx.f16 s6, s1
|
||||
; CHECK-NEXT: vmul.f16 q2, q2, q2
|
||||
; CHECK-NEXT: vmovx.f16 s5, s2
|
||||
; CHECK-NEXT: vins.f16 s4, s6
|
||||
; CHECK-NEXT: vmovx.f16 s6, s3
|
||||
; CHECK-NEXT: vins.f16 s5, s6
|
||||
; CHECK-NEXT: vmovx.f16 s6, s8
|
||||
; CHECK-NEXT: vmovx.f16 s12, s9
|
||||
; CHECK-NEXT: vmovx.f16 s7, s10
|
||||
; CHECK-NEXT: vins.f16 s6, s12
|
||||
; CHECK-NEXT: vmovx.f16 s12, s11
|
||||
; CHECK-NEXT: vins.f16 s2, s3
|
||||
; CHECK-NEXT: vins.f16 s10, s11
|
||||
; CHECK-NEXT: vins.f16 s8, s9
|
||||
; CHECK-NEXT: vins.f16 s0, s1
|
||||
; CHECK-NEXT: vmov.f32 s1, s2
|
||||
; CHECK-NEXT: vins.f16 s7, s12
|
||||
; CHECK-NEXT: vmov.f32 s2, s8
|
||||
; CHECK-NEXT: vmov.f32 s3, s10
|
||||
; CHECK-NEXT: vadd.f16 q0, q1, q0
|
||||
; CHECK-NEXT: vstrb.8 q0, [r1], #16
|
||||
; CHECK-NEXT: le lr, .LBB2_4
|
||||
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
||||
; CHECK-NEXT: cmp r4, r2
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r4, r5, r7, pc}
|
||||
; CHECK-NEXT: .LBB2_6: @ %while.body.preheader26
|
||||
; CHECK-NEXT: dls lr, r5
|
||||
; CHECK-NEXT: .LBB2_7: @ %while.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldr.16 s0, [r3]
|
||||
; CHECK-NEXT: vldr.16 s2, [r3, #2]
|
||||
; CHECK-NEXT: adds r3, #4
|
||||
; CHECK-NEXT: vmul.f16 s0, s0, s0
|
||||
; CHECK-NEXT: vfma.f16 s0, s2, s2
|
||||
; CHECK-NEXT: vstr.16 s0, [r12]
|
||||
; CHECK-NEXT: add.w r12, r12, #2
|
||||
; CHECK-NEXT: le lr, .LBB2_7
|
||||
; CHECK-NEXT: .LBB2_8: @ %while.end
|
||||
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
||||
; CHECK-NEXT: .LBB2_9:
|
||||
; CHECK-NEXT: mov r3, r0
|
||||
; CHECK-NEXT: mov r12, r1
|
||||
; CHECK-NEXT: mov r5, r2
|
||||
; CHECK-NEXT: b .LBB2_6
|
||||
entry:
|
||||
; Nothing to do when numSamples is zero.
%cmp.not11 = icmp eq i32 %numSamples, 0
|
||||
br i1 %cmp.not11, label %while.end, label %while.body.preheader
|
||||
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
; Fewer than 8 outputs: go straight to the scalar loop.
%min.iters.check = icmp ult i32 %numSamples, 8
|
||||
br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck
|
||||
|
||||
|
||||
vector.memcheck: ; preds = %while.body.preheader
|
||||
; Runtime overlap check: the vector loop is skipped when both
; pSrc + 2*numSamples > pDst and pDst + numSamples > pSrc hold.
%scevgep = getelementptr half, half* %pDst, i32 %numSamples
|
||||
%0 = shl i32 %numSamples, 1
|
||||
%scevgep18 = getelementptr half, half* %pSrc, i32 %0
|
||||
%bound0 = icmp ugt half* %scevgep18, %pDst
|
||||
%bound1 = icmp ugt half* %scevgep, %pSrc
|
||||
%found.conflict = and i1 %bound0, %bound1
|
||||
br i1 %found.conflict, label %while.body.preheader26, label %vector.ph
|
||||
|
||||
|
||||
vector.ph: ; preds = %vector.memcheck
|
||||
; n.vec is numSamples rounded down to a multiple of 8 (the vector trip bound).
%n.vec = and i32 %numSamples, -8
|
||||
%1 = shl i32 %n.vec, 1
|
||||
; Resume pointers and remaining count handed to the scalar loop afterwards.
%ind.end = getelementptr half, half* %pSrc, i32 %1
|
||||
%ind.end21 = getelementptr half, half* %pDst, i32 %n.vec
|
||||
%ind.end23 = and i32 %numSamples, 7
|
||||
br label %vector.body
|
||||
|
||||
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%2 = shl i32 %index, 1
|
||||
%next.gep = getelementptr half, half* %pSrc, i32 %2
|
||||
%next.gep24 = getelementptr half, half* %pDst, i32 %index
|
||||
%3 = bitcast half* %next.gep to <16 x half>*
|
||||
%wide.vec = load <16 x half>, <16 x half>* %3, align 2
|
||||
; One wide multiply whose result is consumed by both shuffles below.
%4 = fmul fast <16 x half> %wide.vec, %wide.vec
|
||||
; Even lanes of the squared vector.
%5 = shufflevector <16 x half> %4, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
||||
; Odd lanes of the squared vector.
%6 = shufflevector <16 x half> %4, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
||||
%7 = fadd fast <8 x half> %6, %5
|
||||
%8 = bitcast half* %next.gep24 to <8 x half>*
|
||||
store <8 x half> %7, <8 x half>* %8, align 2
|
||||
%index.next = add i32 %index, 8
|
||||
%9 = icmp eq i32 %index.next, %n.vec
|
||||
br i1 %9, label %middle.block, label %vector.body
|
||||
|
||||
|
||||
middle.block: ; preds = %vector.body
|
||||
; Skip the scalar loop when the vector loop covered every element.
%cmp.n = icmp eq i32 %n.vec, %numSamples
|
||||
br i1 %cmp.n, label %while.end, label %while.body.preheader26
|
||||
|
||||
|
||||
while.body.preheader26: ; preds = %middle.block, %vector.memcheck, %while.body.preheader
|
||||
; Start state for the scalar loop: the original arguments when the vector loop
; was bypassed, or the resume values computed in vector.ph otherwise.
%pSrc.addr.014.ph = phi half* [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
|
||||
%pDst.addr.013.ph = phi half* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
|
||||
%blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
|
||||
br label %while.body
|
||||
|
||||
|
||||
while.body: ; preds = %while.body, %while.body.preheader26
|
||||
%pSrc.addr.014 = phi half* [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
|
||||
%pDst.addr.013 = phi half* [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
|
||||
%blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
|
||||
; Scalar step: read two consecutive halves, square each, store their sum.
%incdec.ptr = getelementptr inbounds half, half* %pSrc.addr.014, i32 1
|
||||
%10 = load half, half* %pSrc.addr.014, align 2
|
||||
%incdec.ptr1 = getelementptr inbounds half, half* %pSrc.addr.014, i32 2
|
||||
%11 = load half, half* %incdec.ptr, align 2
|
||||
%mul = fmul fast half %10, %10
|
||||
%mul2 = fmul fast half %11, %11
|
||||
%add = fadd fast half %mul2, %mul
|
||||
%incdec.ptr3 = getelementptr inbounds half, half* %pDst.addr.013, i32 1
|
||||
store half %add, half* %pDst.addr.013, align 2
|
||||
; Count down the remaining outputs; exit at zero.
%dec = add i32 %blkCnt.012, -1
|
||||
%cmp.not = icmp eq i32 %dec, 0
|
||||
br i1 %cmp.not, label %while.end, label %while.body
|
||||
|
||||
|
||||
while.end: ; preds = %while.body, %middle.block, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; arm_cmplx_mag_squared_f32_cse: single-precision loop with a vectorized main
; body and a scalar remainder, where the vector fmul has multiple users.
; For each of %numSamples outputs, two consecutive floats are read from %pSrc,
; each is squared, and their sum is written to %pDst (going by the function name,
; presumably interleaved real/imaginary pairs - confirm against the C source).
; The vector path handles 4 outputs per iteration and is taken only when
; %numSamples >= 4 and the runtime overlap check on the two buffers passes;
; otherwise, and for any leftover elements, the scalar loop runs.
; NOTE(review): the RUN line for this file is outside this view; the assertions
; below use '@' comments and q-register vldrw forms, so the target looks like
; Thumb2 with MVE - confirm.
define void @arm_cmplx_mag_squared_f32_cse(float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %numSamples) {
|
||||
; CHECK-LABEL: arm_cmplx_mag_squared_f32_cse:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: cmp r2, #0
|
||||
; CHECK-NEXT: beq .LBB3_8
|
||||
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
|
||||
; CHECK-NEXT: cmp r2, #4
|
||||
; CHECK-NEXT: blo .LBB3_9
|
||||
; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
|
||||
; CHECK-NEXT: add.w r3, r0, r2, lsl #3
|
||||
; CHECK-NEXT: cmp r3, r1
|
||||
; CHECK-NEXT: itt hi
|
||||
; CHECK-NEXT: addhi.w r3, r1, r2, lsl #2
|
||||
; CHECK-NEXT: cmphi r3, r0
|
||||
; CHECK-NEXT: bhi .LBB3_9
|
||||
; CHECK-NEXT: @ %bb.3: @ %vector.ph
|
||||
; CHECK-NEXT: bic r4, r2, #3
|
||||
; CHECK-NEXT: movs r5, #1
|
||||
; CHECK-NEXT: subs r3, r4, #4
|
||||
; CHECK-NEXT: add.w r12, r1, r4, lsl #2
|
||||
; CHECK-NEXT: add.w lr, r5, r3, lsr #2
|
||||
; CHECK-NEXT: add.w r3, r0, r4, lsl #3
|
||||
; CHECK-NEXT: and r5, r2, #3
|
||||
; CHECK-NEXT: .LBB3_4: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0], #32
|
||||
; CHECK-NEXT: vmul.f32 q0, q0, q0
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q1
|
||||
; CHECK-NEXT: vmov.f32 s8, s4
|
||||
; CHECK-NEXT: vmov.f32 s9, s6
|
||||
; CHECK-NEXT: vmov.f32 s4, s5
|
||||
; CHECK-NEXT: vmov.f32 s5, s7
|
||||
; CHECK-NEXT: vmov.f32 s10, s0
|
||||
; CHECK-NEXT: vmov.f32 s11, s2
|
||||
; CHECK-NEXT: vmov.f32 s6, s1
|
||||
; CHECK-NEXT: vmov.f32 s7, s3
|
||||
; CHECK-NEXT: vadd.f32 q0, q1, q2
|
||||
; CHECK-NEXT: vstrb.8 q0, [r1], #16
|
||||
; CHECK-NEXT: le lr, .LBB3_4
|
||||
; CHECK-NEXT: @ %bb.5: @ %middle.block
|
||||
; CHECK-NEXT: cmp r4, r2
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r4, r5, r7, pc}
|
||||
; CHECK-NEXT: .LBB3_6: @ %while.body.preheader26
|
||||
; CHECK-NEXT: dls lr, r5
|
||||
; CHECK-NEXT: .LBB3_7: @ %while.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldr s0, [r3]
|
||||
; CHECK-NEXT: vldr s2, [r3, #4]
|
||||
; CHECK-NEXT: adds r3, #8
|
||||
; CHECK-NEXT: vmul.f32 s0, s0, s0
|
||||
; CHECK-NEXT: vfma.f32 s0, s2, s2
|
||||
; CHECK-NEXT: vstmia r12!, {s0}
|
||||
; CHECK-NEXT: le lr, .LBB3_7
|
||||
; CHECK-NEXT: .LBB3_8: @ %while.end
|
||||
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
||||
; CHECK-NEXT: .LBB3_9:
|
||||
; CHECK-NEXT: mov r3, r0
|
||||
; CHECK-NEXT: mov r12, r1
|
||||
; CHECK-NEXT: mov r5, r2
|
||||
; CHECK-NEXT: b .LBB3_6
|
||||
entry:
|
||||
; Nothing to do when numSamples is zero.
%cmp.not11 = icmp eq i32 %numSamples, 0
|
||||
br i1 %cmp.not11, label %while.end, label %while.body.preheader
|
||||
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
; Fewer than 4 outputs: go straight to the scalar loop.
%min.iters.check = icmp ult i32 %numSamples, 4
|
||||
br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck
|
||||
|
||||
|
||||
vector.memcheck: ; preds = %while.body.preheader
|
||||
; Runtime overlap check: the vector loop is skipped when both
; pSrc + 2*numSamples > pDst and pDst + numSamples > pSrc hold.
%scevgep = getelementptr float, float* %pDst, i32 %numSamples
|
||||
%0 = shl i32 %numSamples, 1
|
||||
%scevgep18 = getelementptr float, float* %pSrc, i32 %0
|
||||
%bound0 = icmp ugt float* %scevgep18, %pDst
|
||||
%bound1 = icmp ugt float* %scevgep, %pSrc
|
||||
%found.conflict = and i1 %bound0, %bound1
|
||||
br i1 %found.conflict, label %while.body.preheader26, label %vector.ph
|
||||
|
||||
|
||||
vector.ph: ; preds = %vector.memcheck
|
||||
; n.vec is numSamples rounded down to a multiple of 4 (the vector trip bound).
%n.vec = and i32 %numSamples, -4
|
||||
%1 = shl i32 %n.vec, 1
|
||||
; Resume pointers and remaining count handed to the scalar loop afterwards.
%ind.end = getelementptr float, float* %pSrc, i32 %1
|
||||
%ind.end21 = getelementptr float, float* %pDst, i32 %n.vec
|
||||
%ind.end23 = and i32 %numSamples, 3
|
||||
br label %vector.body
|
||||
|
||||
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%2 = shl i32 %index, 1
|
||||
%next.gep = getelementptr float, float* %pSrc, i32 %2
|
||||
%next.gep24 = getelementptr float, float* %pDst, i32 %index
|
||||
%3 = bitcast float* %next.gep to <8 x float>*
|
||||
%wide.vec = load <8 x float>, <8 x float>* %3, align 4
|
||||
; One wide multiply whose result is consumed by both shuffles below.
%4 = fmul fast <8 x float> %wide.vec, %wide.vec
|
||||
; Even lanes of the squared vector.
%5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
; Odd lanes of the squared vector.
%6 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
||||
%7 = fadd fast <4 x float> %6, %5
|
||||
%8 = bitcast float* %next.gep24 to <4 x float>*
|
||||
store <4 x float> %7, <4 x float>* %8, align 4
|
||||
%index.next = add i32 %index, 4
|
||||
%9 = icmp eq i32 %index.next, %n.vec
|
||||
br i1 %9, label %middle.block, label %vector.body
|
||||
|
||||
|
||||
middle.block: ; preds = %vector.body
|
||||
; Skip the scalar loop when the vector loop covered every element.
%cmp.n = icmp eq i32 %n.vec, %numSamples
|
||||
br i1 %cmp.n, label %while.end, label %while.body.preheader26
|
||||
|
||||
|
||||
while.body.preheader26: ; preds = %middle.block, %vector.memcheck, %while.body.preheader
|
||||
; Start state for the scalar loop: the original arguments when the vector loop
; was bypassed, or the resume values computed in vector.ph otherwise.
%pSrc.addr.014.ph = phi float* [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
|
||||
%pDst.addr.013.ph = phi float* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
|
||||
%blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
|
||||
br label %while.body
|
||||
|
||||
|
||||
while.body: ; preds = %while.body, %while.body.preheader26
|
||||
%pSrc.addr.014 = phi float* [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
|
||||
%pDst.addr.013 = phi float* [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
|
||||
%blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
|
||||
; Scalar step: read two consecutive floats, square each, store their sum.
%incdec.ptr = getelementptr inbounds float, float* %pSrc.addr.014, i32 1
|
||||
%10 = load float, float* %pSrc.addr.014, align 4
|
||||
%incdec.ptr1 = getelementptr inbounds float, float* %pSrc.addr.014, i32 2
|
||||
%11 = load float, float* %incdec.ptr, align 4
|
||||
%mul = fmul fast float %10, %10
|
||||
%mul2 = fmul fast float %11, %11
|
||||
%add = fadd fast float %mul2, %mul
|
||||
%incdec.ptr3 = getelementptr inbounds float, float* %pDst.addr.013, i32 1
|
||||
store float %add, float* %pDst.addr.013, align 4
|
||||
; Count down the remaining outputs; exit at zero.
%dec = add i32 %blkCnt.012, -1
|
||||
%cmp.not = icmp eq i32 %dec, 0
|
||||
br i1 %cmp.not, label %while.end, label %while.body
|
||||
|
||||
|
||||
while.end: ; preds = %while.body, %middle.block, %entry
|
||||
ret void
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue