; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s | FileCheck %s

target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
target triple = "hexagon"
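
; The checks below cover conversions of signed and unsigned i8/i16/i32
; vector elements to f16/f32. The "No widening" / "Widen ..." labels
; indicate whether the vector type is HVX-native or must first be widened
; under -hexagon-hvx-widen=32.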
; s8 -> f16
; No widening
define void @s8f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r7 = #1
; CHECK-NEXT: r6 = #64
; CHECK-NEXT: v1:0.h = vunpack(v0.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r7)
; CHECK-NEXT: r3:2 = combine(#31,#5)
; CHECK-NEXT: v3.h = vabs(v0.h)
; CHECK-NEXT: v4.h = vabs(v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vsplat(r6)
; CHECK-NEXT: v7.h = vsplat(r3)
; CHECK-NEXT: v9 = vxor(v9,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r5 = ##32768
; CHECK-NEXT: v5.uh = vcl0(v3.uh)
; CHECK-NEXT: q0 = vcmp.gt(v9.h,v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10.h = vsplat(r5)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v6.uh = vcl0(v4.uh)
; CHECK-NEXT: v5.h = vadd(v5.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27 = vmux(q0,v10,v9)
; CHECK-NEXT: v6.h = vadd(v6.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vasl(v3.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.h = vasl(v4.h,v6.h)
; CHECK-NEXT: v13 = vand(v3,v8)
; CHECK-NEXT: v11.h = vadd(v3.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.h = vadd(v4.h,v7.h)
; CHECK-NEXT: q2 = vcmp.eq(v13.h,v9.h)
; CHECK-NEXT: v8 = vand(v4,v8)
; CHECK-NEXT: q1 = vcmp.gt(v3.uh,v11.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uh = vlsr(v11.uh,r2)
; CHECK-NEXT: v13 = vmux(q2,v9,v2)
; CHECK-NEXT: q2 = vcmp.eq(v8.h,v9.h)
; CHECK-NEXT: q3 = vcmp.gt(v4.uh,v14.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.uh = vlsr(v14.uh,r2)
; CHECK-NEXT: v22 = vmux(q2,v9,v2)
; CHECK-NEXT: v21 = vmux(q1,v2,v9)
; CHECK-NEXT: v2 = vmux(q3,v2,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.uh = vlsr(v4.uh,r2)
; CHECK-NEXT: v13.h = vadd(v11.h,v13.h)
; CHECK-NEXT: v24.h = vadd(v20.h,v22.h)
; CHECK-NEXT: v2.h = vadd(v2.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uh = vlsr(v3.uh,r2)
; CHECK-NEXT: v23.h = vadd(v21.h,v7.h)
; CHECK-NEXT: v2.h = vsub(v2.h,v6.h)
; CHECK-NEXT: q3 = vcmp.gt(v9.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uh = vlsr(v11.uh,r7)
; CHECK-NEXT: v3.h = vsub(v23.h,v5.h)
; CHECK-NEXT: q1 = vcmp.eq(v12.h,v11.h)
; CHECK-NEXT: q2 = vcmp.eq(v19.h,v20.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uh = vlsr(v13.uh,r7)
; CHECK-NEXT: v28 = vmux(q3,v10,v9)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v9.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uh = vlsr(v24.uh,r7)
; CHECK-NEXT: v5 = vmux(q1,v25,v11)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uh = vlsr(v20.uh,r7)
; CHECK-NEXT: v5 = vor(v27,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vasl(v3.h,r4)
; CHECK-NEXT: v4 = vmux(q2,v26,v4)
; CHECK-NEXT: q2 = vcmp.eq(v1.h,v9.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vasl(v2.h,r4)
; CHECK-NEXT: v4 = vor(v28,v4)
; CHECK-NEXT: v29 = vor(v5,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vor(v4,v2)
; CHECK-NEXT: v31 = vmux(q3,v9,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v9,v2)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
%v0 = load <128 x i8>, ptr %a0, align 128
%v1 = sitofp <128 x i8> %v0 to <128 x half>
store <128 x half> %v1, ptr %a1, align 128
ret void
}

; Widen input
define void @s8f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r3:2 = combine(#64,#31)
; CHECK-NEXT: v1:0.h = vunpack(v0.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vsplat(r6)
; CHECK-NEXT: v4.h = vsplat(r2)
; CHECK-NEXT: v2.h = vabs(v0.h)
; CHECK-NEXT: v1 = vxor(v1,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vsplat(r3)
; CHECK-NEXT: r5:4 = combine(##32768,#5)
; CHECK-NEXT: r2 = #10
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vsplat(r5)
; CHECK-NEXT: v5.uh = vcl0(v2.uh)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.h = vadd(v5.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vasl(v2.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v2.h,v4.h)
; CHECK-NEXT: v6 = vand(v2,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.uh = vlsr(v2.uh,r4)
; CHECK-NEXT: q0 = vcmp.eq(v6.h,v1.h)
; CHECK-NEXT: q1 = vcmp.gt(v2.uh,v7.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4)
; CHECK-NEXT: v26 = vmux(q0,v1,v3)
; CHECK-NEXT: v3 = vmux(q1,v3,v1)
; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v25.h,v26.h)
; CHECK-NEXT: v3.h = vadd(v3.h,v4.h)
; CHECK-NEXT: q2 = vcmp.eq(v2.h,v25.h)
; CHECK-NEXT: v30 = vmux(q1,v8,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6)
; CHECK-NEXT: v28.h = vsub(v3.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vasl(v28.h,r2)
; CHECK-NEXT: v3 = vmux(q2,v29,v27)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v30,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v1,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <64 x i8>, ptr %a0, align 128
%v1 = sitofp <64 x i8> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}


; s8 -> f32
; No widening
define void @s8f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##.LCPI2_0,#8)
; CHECK-NEXT: v3:2.h = vunpack(v1.b)
; CHECK-NEXT: v1.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r7 = #512
; CHECK-NEXT: r4 = #255
; CHECK-NEXT: v3 = vmem(r3+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r0)
; CHECK-NEXT: v13 = vsplat(r7)
; CHECK-NEXT: v4 = vdelta(v1,v3)
; CHECK-NEXT: v0 = vxor(v0,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r4)
; CHECK-NEXT: r6 = ##-2147483648
; CHECK-NEXT: v3:2.w = vunpack(v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v15 = vsplat(r6)
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: v5:4.h = vunpack(v4.b)
; CHECK-NEXT: v6.w = vabs(v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v17 = vsplat(r5)
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v8.w = vabs(v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5:4.w = vunpack(v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.uw = vcl0(v6.uw)
; CHECK-NEXT: v7.w = vabs(v4.w)
; CHECK-NEXT: v11.w = vabs(v5.w)
; CHECK-NEXT: q0 = vcmp.gt(v0.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uw = vcl0(v8.uw)
; CHECK-NEXT: v9.w = vadd(v9.w,v1.w)
; CHECK-NEXT: v18 = vmux(q0,v15,v0)
; CHECK-NEXT: q1 = vcmp.gt(v0.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vcl0(v7.uw)
; CHECK-NEXT: v14.w = vadd(v14.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v16.uw = vcl0(v11.uw)
; CHECK-NEXT: v12.w = vadd(v12.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v6.w,v9.w)
; CHECK-NEXT: v16.w = vadd(v16.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.w = vasl(v7.w,v12.w)
; CHECK-NEXT: v19 = vand(v6,v13)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.w = vasl(v11.w,v16.w)
; CHECK-NEXT: v21 = vand(v7,v13)
; CHECK-NEXT: v31.w = vadd(v7.w,v10.w)
; CHECK-NEXT: q0 = vcmp.eq(v19.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.w = vasl(v8.w,v14.w)
; CHECK-NEXT: v22.w = vadd(v11.w,v10.w)
; CHECK-NEXT: q3 = vcmp.eq(v21.w,v0.w)
; CHECK-NEXT: v24 = vand(v11,v13)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uw = vlsr(v31.uw,r2)
; CHECK-NEXT: v29 = vmux(q3,v0,v1)
; CHECK-NEXT: q3 = vcmp.eq(v24.w,v0.w)
; CHECK-NEXT: q2 = vcmp.gt(v7.uw,v31.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v11.uw,r2)
; CHECK-NEXT: v27 = vmux(q3,v0,v1)
; CHECK-NEXT: v19.w = vadd(v23.w,v29.w)
; CHECK-NEXT: v31 = vmux(q2,v1,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v22.uw,r2)
; CHECK-NEXT: v13 = vand(v8,v13)
; CHECK-NEXT: v26 = vmux(q0,v0,v1)
; CHECK-NEXT: v12.w = vsub(v31.w,v12.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.uw = vlsr(v7.uw,r2)
; CHECK-NEXT: q3 = vcmp.eq(v28.w,v30.w)
; CHECK-NEXT: v28.w = vadd(v30.w,v27.w)
; CHECK-NEXT: v31 = vmux(q1,v15,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uw = vlsr(v30.uw,r0)
; CHECK-NEXT: v30.w = vadd(v6.w,v10.w)
; CHECK-NEXT: q2 = vcmp.eq(v20.w,v23.w)
; CHECK-NEXT: v10.w = vadd(v8.w,v10.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vlsr(v28.uw,r0)
; CHECK-NEXT: q0 = vcmp.gt(v8.uw,v10.uw)
; CHECK-NEXT: v12.w = vadd(v12.w,v17.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uw = vlsr(v23.uw,r0)
; CHECK-NEXT: v7 = vmux(q3,v7,v29)
; CHECK-NEXT: q3 = vcmp.eq(v13.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.uw = vlsr(v19.uw,r0)
; CHECK-NEXT: v29 = vmux(q3,v0,v1)
; CHECK-NEXT: v7 = vor(v31,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uw = vlsr(v30.uw,r2)
; CHECK-NEXT: v19 = vmux(q2,v19,v23)
; CHECK-NEXT: q2 = vcmp.gt(v11.uw,v22.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10.uw = vlsr(v10.uw,r2)
; CHECK-NEXT: v27 = vmux(q2,v1,v0)
; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v30.uw)
; CHECK-NEXT: v28.w = vadd(v25.w,v26.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: v31 = vmux(q2,v1,v0)
; CHECK-NEXT: v1 = vmux(q0,v1,v0)
; CHECK-NEXT: v30.w = vadd(v10.w,v29.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v8.uw,r2)
; CHECK-NEXT: v1.w = vsub(v1.w,v14.w)
; CHECK-NEXT: q3 = vcmp.eq(v6.w,v25.w)
; CHECK-NEXT: v21.w = vsub(v31.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uw = vlsr(v28.uw,r0)
; CHECK-NEXT: v6.w = vadd(v21.w,v17.w)
; CHECK-NEXT: v1.w = vadd(v1.w,v17.w)
; CHECK-NEXT: q0 = vcmp.eq(v24.w,v10.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v22.uw = vlsr(v25.uw,r0)
; CHECK-NEXT: v13.w = vsub(v27.w,v16.w)
; CHECK-NEXT: q2 = vcmp.gt(v0.w,v3.w)
; CHECK-NEXT: v18 = vor(v18,v19)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uw = vlsr(v30.uw,r0)
; CHECK-NEXT: v8 = vmux(q3,v8,v22)
; CHECK-NEXT: q3 = vcmp.gt(v0.w,v2.w)
; CHECK-NEXT: v26 = vmux(q2,v15,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v10.uw,r0)
; CHECK-NEXT: v25.w = vadd(v13.w,v17.w)
; CHECK-NEXT: v27 = vmux(q3,v15,v0)
; CHECK-NEXT: v8 = vor(v26,v8)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v6.w,r4)
; CHECK-NEXT: v9 = vmux(q0,v23,v24)
; CHECK-NEXT: q2 = vcmp.eq(v3.w,v0.w)
; CHECK-NEXT: q3 = vcmp.eq(v2.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r4)
; CHECK-NEXT: v9 = vor(v27,v9)
; CHECK-NEXT: v6 = vor(v8,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.w = vasl(v12.w,r4)
; CHECK-NEXT: v1 = vor(v9,v1)
; CHECK-NEXT: v29 = vmux(q2,v0,v6)
; CHECK-NEXT: vmem(r1+#1) = v29.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.w = vasl(v25.w,r4)
; CHECK-NEXT: v1 = vmux(q3,v0,v1)
; CHECK-NEXT: q2 = vcmp.eq(v5.w,v0.w)
; CHECK-NEXT: vmem(r1+#0) = v1.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vor(v7,v28)
; CHECK-NEXT: v31 = vor(v18,v12)
; CHECK-NEXT: q3 = vcmp.eq(v4.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vmux(q2,v0,v30)
; CHECK-NEXT: v0 = vmux(q3,v0,v31)
; CHECK-NEXT: vmem(r1+#3) = v2.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#2) = v0
; CHECK-NEXT: }
%v0 = load <128 x i8>, ptr %a0, align 128
%v1 = sitofp <128 x i8> %v0 to <128 x float>
store <128 x float> %v1, ptr %a1, align 128
ret void
}

; Widen input #1
define void @s8f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: v3:2.h = vunpack(v0.b)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r0)
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: r6 = #512
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r3)
; CHECK-NEXT: v3:2.w = vunpack(v2.h)
; CHECK-NEXT: v22 = vxor(v22,v22)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r6)
; CHECK-NEXT: r7 = ##-2147483648
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9 = vsplat(r7)
; CHECK-NEXT: v4.w = vabs(v2.w)
; CHECK-NEXT: v5.w = vabs(v3.w)
; CHECK-NEXT: q0 = vcmp.gt(v22.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12 = vsplat(r5)
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v11 = vmux(q0,v9,v22)
; CHECK-NEXT: q0 = vcmp.gt(v22.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vcl0(v4.uw)
; CHECK-NEXT: v30 = vmux(q0,v9,v22)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uw = vcl0(v5.uw)
; CHECK-NEXT: v6.w = vadd(v6.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.w = vadd(v8.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasl(v4.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vasl(v5.w,v8.w)
; CHECK-NEXT: v13 = vand(v4,v10)
; CHECK-NEXT: v14.w = vadd(v4.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vand(v5,v10)
; CHECK-NEXT: v7.w = vadd(v5.w,v7.w)
; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v14.uw)
; CHECK-NEXT: q1 = vcmp.eq(v13.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uw = vlsr(v14.uw,r2)
; CHECK-NEXT: q3 = vcmp.eq(v10.w,v22.w)
; CHECK-NEXT: v25 = vmux(q2,v1,v22)
; CHECK-NEXT: q2 = vcmp.gt(v5.uw,v7.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2)
; CHECK-NEXT: v26 = vmux(q1,v22,v1)
; CHECK-NEXT: v27 = vmux(q3,v22,v1)
; CHECK-NEXT: v1 = vmux(q2,v1,v22)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2)
; CHECK-NEXT: v5.w = vadd(v14.w,v26.w)
; CHECK-NEXT: v29.w = vadd(v7.w,v27.w)
; CHECK-NEXT: v6.w = vsub(v25.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v1.w = vsub(v1.w,v8.w)
; CHECK-NEXT: v6.w = vadd(v6.w,v12.w)
; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v14.uw,r0)
; CHECK-NEXT: v1.w = vadd(v1.w,v12.w)
; CHECK-NEXT: q1 = vcmp.eq(v23.w,v14.w)
; CHECK-NEXT: q2 = vcmp.eq(v3.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vlsr(v7.uw,r0)
; CHECK-NEXT: v5 = vmux(q1,v5,v28)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v29.uw,r0)
; CHECK-NEXT: v5 = vor(v11,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v6.w,r4)
; CHECK-NEXT: v4 = vmux(q3,v4,v7)
; CHECK-NEXT: q3 = vcmp.eq(v2.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r4)
; CHECK-NEXT: v4 = vor(v30,v4)
; CHECK-NEXT: v31 = vor(v5,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v4,v1)
; CHECK-NEXT: v0 = vmux(q3,v22,v31)
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vmux(q2,v22,v1)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v1.new
; CHECK-NEXT: }
%v0 = load <64 x i8>, ptr %a0, align 128
%v1 = sitofp <64 x i8> %v0 to <64 x float>
store <64 x float> %v1, ptr %a1, align 128
ret void
}

; Widen input #2
define void @s8f32_2(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f32_2:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v1:0.h = vunpack(v0.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r0)
; CHECK-NEXT: v4 = vsplat(r3)
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v1:0.w = vunpack(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r2)
; CHECK-NEXT: v8 = vsplat(r4)
; CHECK-NEXT: v5.w = vabs(v0.w)
; CHECK-NEXT: q2 = vcmp.gt(v3.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r7)
; CHECK-NEXT: r2 = #23
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vcl0(v5.uw)
; CHECK-NEXT: v30 = vmux(q2,v7,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v6.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vasl(v5.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vadd(v5.w,v1.w)
; CHECK-NEXT: v4 = vand(v5,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6)
; CHECK-NEXT: q0 = vcmp.eq(v4.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v1.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT: v4 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v1.w,v4.w)
; CHECK-NEXT: v2.w = vsub(v2.w,v6.w)
; CHECK-NEXT: q3 = vcmp.eq(v5.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v1.uw,r0)
; CHECK-NEXT: v2.w = vadd(v2.w,v8.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uw = vlsr(v4.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r2)
; CHECK-NEXT: v1 = vmux(q3,v29,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v30,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v3,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <32 x i8>, ptr %a0, align 128
%v1 = sitofp <32 x i8> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}


; s16 -> f16
; No widening
define void @s16f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r3:2 = combine(#64,#31)
; CHECK-NEXT: v1.h = vabs(v0.h)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vsplat(r6)
; CHECK-NEXT: v5.h = vsplat(r2)
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vsplat(r3)
; CHECK-NEXT: r5:4 = combine(##32768,#5)
; CHECK-NEXT: v4.uh = vcl0(v1.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vsplat(r5)
; CHECK-NEXT: r2 = #10
; CHECK-NEXT: v4.h = vadd(v4.h,v3.h)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v1.h,v5.h)
; CHECK-NEXT: v6 = vand(v1,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uh = vlsr(v1.uh,r4)
; CHECK-NEXT: q0 = vcmp.eq(v6.h,v2.h)
; CHECK-NEXT: q1 = vcmp.gt(v1.uh,v7.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4)
; CHECK-NEXT: v26 = vmux(q0,v2,v3)
; CHECK-NEXT: v3 = vmux(q1,v3,v2)
; CHECK-NEXT: q1 = vcmp.gt(v2.h,v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v25.h,v26.h)
; CHECK-NEXT: v3.h = vadd(v3.h,v5.h)
; CHECK-NEXT: q2 = vcmp.eq(v1.h,v25.h)
; CHECK-NEXT: v30 = vmux(q1,v8,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6)
; CHECK-NEXT: v28.h = vsub(v3.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v28.h,r2)
; CHECK-NEXT: v3 = vmux(q2,v29,v27)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v30,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v2,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <64 x i16>, ptr %a0, align 128
%v1 = sitofp <64 x i16> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}

; Widen input and result
define void @s16f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r7 = #1
; CHECK-NEXT: r3:2 = combine(#31,#64)
; CHECK-NEXT: v1.h = vabs(v0.h)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r7)
; CHECK-NEXT: v5.h = vsplat(r3)
; CHECK-NEXT: r6 = #5
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vsplat(r2)
; CHECK-NEXT: r4 = ##32768
; CHECK-NEXT: v4.uh = vcl0(v1.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vsplat(r4)
; CHECK-NEXT: r3 = #10
; CHECK-NEXT: q2 = vcmp.gt(v3.h,v0.h)
; CHECK-NEXT: v4.h = vadd(v4.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v8,v3)
; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v1.h,v5.h)
; CHECK-NEXT: v6 = vand(v1,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uh = vlsr(v1.uh,r6)
; CHECK-NEXT: q1 = vcmp.eq(v6.h,v3.h)
; CHECK-NEXT: q0 = vcmp.gt(v1.uh,v7.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uh = vlsr(v7.uh,r6)
; CHECK-NEXT: v26 = vmux(q1,v3,v2)
; CHECK-NEXT: v2 = vmux(q0,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v25.h,v26.h)
; CHECK-NEXT: v2.h = vadd(v2.h,v5.h)
; CHECK-NEXT: q3 = vcmp.eq(v1.h,v25.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uh = vlsr(v25.uh,r7)
; CHECK-NEXT: v28.h = vsub(v2.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v7.uh,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v28.h,r3)
; CHECK-NEXT: q3 = vsetq(r2)
; CHECK-NEXT: v2 = vmux(q3,v29,v27)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vor(v30,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v3,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
%v0 = load <32 x i16>, ptr %a0, align 128
%v1 = sitofp <32 x i16> %v0 to <32 x half>
store <32 x half> %v1, ptr %a1, align 128
ret void
}


; s16 -> f32
; No widening
define void @s16f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: v1:0.w = vunpack(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r0)
; CHECK-NEXT: r7 = #512
; CHECK-NEXT: v4.w = vabs(v0.w)
; CHECK-NEXT: v6.w = vabs(v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: v9 = vsplat(r7)
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13 = vsplat(r5)
; CHECK-NEXT: r6 = ##-2147483648
; CHECK-NEXT: v7.uw = vcl0(v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r6)
; CHECK-NEXT: v8.uw = vcl0(v6.uw)
; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w)
; CHECK-NEXT: v7.w = vadd(v7.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v8.w = vadd(v8.w,v3.w)
; CHECK-NEXT: v27 = vmux(q0,v10,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasl(v4.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v6.w,v8.w)
; CHECK-NEXT: v11.w = vadd(v4.w,v5.w)
; CHECK-NEXT: v12 = vand(v4,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v6.w,v5.w)
; CHECK-NEXT: v9 = vand(v6,v9)
; CHECK-NEXT: q1 = vcmp.eq(v12.w,v2.w)
; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v11.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v22.uw = vlsr(v11.uw,r2)
; CHECK-NEXT: q3 = vcmp.eq(v9.w,v2.w)
; CHECK-NEXT: v23 = vmux(q1,v2,v3)
; CHECK-NEXT: v14 = vmux(q2,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2)
; CHECK-NEXT: v11.w = vadd(v22.w,v23.w)
; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v5.uw)
; CHECK-NEXT: v25 = vmux(q3,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v5.w = vadd(v24.w,v25.w)
; CHECK-NEXT: v3 = vmux(q2,v3,v2)
; CHECK-NEXT: v7.w = vsub(v14.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: v3.w = vsub(v3.w,v8.w)
; CHECK-NEXT: q3 = vcmp.eq(v21.w,v22.w)
; CHECK-NEXT: v7.w = vadd(v7.w,v13.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v22.uw,r0)
; CHECK-NEXT: v3.w = vadd(v3.w,v13.w)
; CHECK-NEXT: q2 = vcmp.eq(v6.w,v24.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uw = vlsr(v11.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0)
; CHECK-NEXT: v4 = vmux(q3,v11,v4)
; CHECK-NEXT: q3 = vcmp.gt(v2.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uw = vlsr(v24.uw,r0)
; CHECK-NEXT: v28 = vmux(q3,v10,v2)
; CHECK-NEXT: v4 = vor(v27,v4)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.w = vasl(v7.w,r4)
; CHECK-NEXT: v5 = vmux(q2,v5,v26)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.w = vasl(v3.w,r4)
; CHECK-NEXT: v5 = vor(v28,v5)
; CHECK-NEXT: v29 = vor(v4,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v5,v3)
; CHECK-NEXT: v31 = vmux(q3,v2,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v2,v3)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
%v0 = load <64 x i16>, ptr %a0, align 128
%v1 = sitofp <64 x i16> %v0 to <64 x float>
store <64 x float> %v1, ptr %a1, align 128
ret void
}

; Widen input
define void @s16f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v1:0.w = vunpack(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r0)
; CHECK-NEXT: v4 = vsplat(r2)
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v2.w = vabs(v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT: v1 = vxor(v1,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v5.uw = vcl0(v2.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v29 = vsplat(r7)
; CHECK-NEXT: q2 = vcmp.gt(v1.w,v0.w)
; CHECK-NEXT: v5.w = vadd(v5.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #23
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v2.w,v4.w)
; CHECK-NEXT: v6 = vand(v2,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.uw = vlsr(v2.uw,r6)
; CHECK-NEXT: q0 = vcmp.eq(v6.w,v1.w)
; CHECK-NEXT: q1 = vcmp.gt(v2.uw,v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r6)
; CHECK-NEXT: v6 = vmux(q0,v1,v3)
; CHECK-NEXT: v3 = vmux(q1,v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v4.w,v6.w)
; CHECK-NEXT: v27.w = vsub(v3.w,v5.w)
; CHECK-NEXT: q3 = vcmp.eq(v2.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v4.uw,r0)
; CHECK-NEXT: v2.w = vadd(v27.w,v7.w)
; CHECK-NEXT: v4 = vmux(q2,v29,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r2)
; CHECK-NEXT: v3 = vmux(q3,v30,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v4,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v1,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <32 x i16>, ptr %a0, align 128
%v1 = sitofp <32 x i16> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}


; s32 -> f16
; No widening
define void @s32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#8,#1)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v2.w = vabs(v1.w)
; CHECK-NEXT: v1.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r2)
; CHECK-NEXT: r4 = #512
; CHECK-NEXT: v3.w = vabs(v0.w)
; CHECK-NEXT: v0.cur = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9 = vsplat(r4)
; CHECK-NEXT: v8 = vsplat(r6)
; CHECK-NEXT: v5.uw = vcl0(v2.uw)
; CHECK-NEXT: v7 = vxor(v7,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v6.uw = vcl0(v3.uw)
; CHECK-NEXT: v5.w = vadd(v5.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28 = vsplat(r4)
; CHECK-NEXT: r5 = ##-2147483648
; CHECK-NEXT: v6.w = vadd(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13 = vsplat(r5)
; CHECK-NEXT: v2.w = vasl(v2.w,v5.w)
; CHECK-NEXT: q0 = vcmp.gt(v7.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.w = vasl(v3.w,v6.w)
; CHECK-NEXT: v27 = vmux(q0,v13,v7)
; CHECK-NEXT: v10.w = vadd(v2.w,v8.w)
; CHECK-NEXT: v11 = vand(v2,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9 = vand(v3,v9)
; CHECK-NEXT: q1 = vcmp.eq(v11.w,v7.w)
; CHECK-NEXT: v8.w = vadd(v3.w,v8.w)
; CHECK-NEXT: q2 = vcmp.gt(v2.uw,v10.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vlsr(v2.uw,r3)
; CHECK-NEXT: q3 = vcmp.eq(v9.w,v7.w)
; CHECK-NEXT: v23 = vmux(q1,v7,v4)
; CHECK-NEXT: q1 = vcmp.gt(v3.uw,v8.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.uw = vlsr(v10.uw,r3)
; CHECK-NEXT: v25 = vmux(q3,v7,v4)
; CHECK-NEXT: v24 = vmux(q2,v4,v7)
; CHECK-NEXT: v4 = vmux(q1,v4,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3)
; CHECK-NEXT: v9.w = vadd(v2.w,v23.w)
; CHECK-NEXT: v5.w = vsub(v24.w,v5.w)
; CHECK-NEXT: v4.w = vsub(v4.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.uw = vlsr(v3.uw,r3)
; CHECK-NEXT: v26.w = vadd(v8.w,v25.w)
; CHECK-NEXT: q3 = vcmp.eq(v12.w,v2.w)
; CHECK-NEXT: v5.w = vadd(v5.w,v28.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: v2.uw = vlsr(v2.uw,r2)
; CHECK-NEXT: q2 = vcmp.eq(v3.w,v8.w)
; CHECK-NEXT: v4.w = vadd(v4.w,v28.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uw = vlsr(v26.uw,r2)
; CHECK-NEXT: v2 = vmux(q3,v9,v2)
; CHECK-NEXT: q3 = vcmp.gt(v7.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.uw = vlsr(v8.uw,r2)
; CHECK-NEXT: v30 = vmux(q3,v13,v7)
; CHECK-NEXT: v2 = vor(v27,v2)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vasl(v5.w,r3)
; CHECK-NEXT: v3 = vmux(q2,v29,v3)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.w = vasl(v4.w,r3)
; CHECK-NEXT: v31 = vor(v30,v3)
; CHECK-NEXT: v2 = vor(v2,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v31,v3)
; CHECK-NEXT: v2 = vmux(q2,v7,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v7,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.qf32 = vadd(v2.sf,v7.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v7.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.hf = v3:2.qf32
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.h = vdeal(v0.h)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <64 x i32>, ptr %a0, align 128
%v1 = sitofp <64 x i32> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}

; Widen result
define void @s32f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: v1.w = vabs(v0.w)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r6)
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: r4 = #512
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: v6 = vsplat(r4)
; CHECK-NEXT: v4.uw = vcl0(v1.uw)
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: r4 = ##-2147483648
; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28 = vsplat(r5)
; CHECK-NEXT: v29 = vsplat(r4)
; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: v1.w = vasl(v1.w,v4.w)
; CHECK-NEXT: v31 = vmux(q3,v29,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v1.w,v5.w)
; CHECK-NEXT: v6 = vand(v1,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vlsr(v1.uw,r2)
; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #64
; CHECK-NEXT: v1.uw = vlsr(v5.uw,r2)
; CHECK-NEXT: v27 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: q3 = vsetq(r2)
; CHECK-NEXT: v5.w = vadd(v1.w,v27.w)
; CHECK-NEXT: v2.w = vsub(v2.w,v4.w)
; CHECK-NEXT: q2 = vcmp.eq(v7.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT: v2.w = vadd(v2.w,v28.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v5.uw,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r3)
; CHECK-NEXT: v1 = vmux(q2,v30,v1)
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v31,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.qf32 = vadd(v3.sf,v3.sf)
; CHECK-NEXT: v0 = vor(v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v3,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v3.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.hf = v1:0.qf32
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.h = vdeal(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
%v0 = load <32 x i32>, ptr %a0, align 128
%v1 = sitofp <32 x i32> %v0 to <32 x half>
store <32 x half> %v1, ptr %a1, align 128
ret void
}

; s32 -> f32
; No widening
define void @s32f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v1.w = vabs(v0.w)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r0)
; CHECK-NEXT: v5 = vsplat(r2)
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT: v4.uw = vcl0(v1.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v4.w = vadd(v4.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v29 = vsplat(r7)
; CHECK-NEXT: r2 = #23
; CHECK-NEXT: q2 = vcmp.gt(v2.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v1.w,v5.w)
; CHECK-NEXT: v6 = vand(v1,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT: q0 = vcmp.eq(v6.w,v2.w)
; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6)
; CHECK-NEXT: v6 = vmux(q0,v2,v3)
; CHECK-NEXT: v3 = vmux(q1,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v5.w,v6.w)
; CHECK-NEXT: v27.w = vsub(v3.w,v4.w)
; CHECK-NEXT: q3 = vcmp.eq(v1.w,v5.w)
; CHECK-NEXT: v4 = vmux(q2,v29,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0)
; CHECK-NEXT: v1.w = vadd(v27.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r2)
; CHECK-NEXT: v3 = vmux(q3,v30,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v4,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v2,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <32 x i32>, ptr %a0, align 128
%v1 = sitofp <32 x i32> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}

; Widen input and result
define void @s32f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v1.w = vabs(v0.w)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r0)
; CHECK-NEXT: v5 = vsplat(r2)
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT: v4.uw = vcl0(v1.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v29 = vsplat(r7)
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #64
; CHECK-NEXT: v1.w = vasl(v1.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v1.w,v5.w)
; CHECK-NEXT: v6 = vand(v1,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6)
; CHECK-NEXT: v6 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v5.w,v6.w)
; CHECK-NEXT: v27.w = vsub(v2.w,v4.w)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w)
; CHECK-NEXT: v4 = vmux(q3,v29,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0)
; CHECK-NEXT: q3 = vsetq(r2)
; CHECK-NEXT: v1.w = vadd(v27.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r3)
; CHECK-NEXT: v2 = vmux(q2,v30,v28)
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vor(v4,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v3,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
%v0 = load <16 x i32>, ptr %a0, align 128
%v1 = sitofp <16 x i32> %v0 to <16 x float>
store <16 x float> %v1, ptr %a1, align 128
ret void
}


; u8 -> f16
; No widening
define void @u8f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r3:2 = combine(#31,#5)
; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r6)
; CHECK-NEXT: v5.h = vsplat(r3)
; CHECK-NEXT: r5 = #64
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vsplat(r5)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v4.uh = vcl0(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uh = vcl0(v1.uh)
; CHECK-NEXT: v4.h = vadd(v4.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v7.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vasl(v0.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.h = vasl(v1.h,v7.h)
; CHECK-NEXT: v10 = vand(v8,v6)
; CHECK-NEXT: v9.h = vadd(v8.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v22.h = vadd(v11.h,v5.h)
; CHECK-NEXT: v6 = vand(v11,v6)
; CHECK-NEXT: q0 = vcmp.gt(v8.uh,v9.uh)
; CHECK-NEXT: q1 = vcmp.eq(v10.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uh = vlsr(v8.uh,r2)
; CHECK-NEXT: q2 = vcmp.eq(v6.h,v3.h)
; CHECK-NEXT: q3 = vcmp.gt(v11.uh,v22.uh)
; CHECK-NEXT: v12 = vmux(q1,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.uh = vlsr(v9.uh,r2)
; CHECK-NEXT: v13 = vmux(q2,v3,v2)
; CHECK-NEXT: v25 = vmux(q0,v2,v3)
; CHECK-NEXT: v2 = vmux(q3,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uh = vlsr(v22.uh,r2)
; CHECK-NEXT: v24.h = vadd(v9.h,v12.h)
; CHECK-NEXT: v2.h = vadd(v2.h,v5.h)
; CHECK-NEXT: v12.h = vadd(v25.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uh = vlsr(v11.uh,r2)
; CHECK-NEXT: v13.h = vadd(v8.h,v13.h)
; CHECK-NEXT: v4.h = vsub(v12.h,v4.h)
; CHECK-NEXT: v2.h = vsub(v2.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uh = vlsr(v9.uh,r6)
; CHECK-NEXT: q2 = vcmp.eq(v21.h,v9.h)
; CHECK-NEXT: q3 = vcmp.eq(v23.h,v8.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uh = vlsr(v24.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uh = vlsr(v13.uh,r6)
; CHECK-NEXT: v5 = vmux(q2,v26,v14)
; CHECK-NEXT: q2 = vcmp.eq(v1.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uh = vlsr(v8.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.h = vasl(v4.h,r4)
; CHECK-NEXT: v6 = vmux(q3,v27,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vasl(v2.h,r4)
; CHECK-NEXT: v29 = vor(v5,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vor(v6,v2)
; CHECK-NEXT: v31 = vmux(q3,v3,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v3,v2)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
%v0 = load <128 x i8>, ptr %a0, align 128
%v1 = uitofp <128 x i8> %v0 to <128 x half>
store <128 x half> %v1, ptr %a1, align 128
ret void
}

; Widen input
define void @u8f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r3:2 = combine(#64,#31)
; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vsplat(r6)
; CHECK-NEXT: v4.h = vsplat(r2)
; CHECK-NEXT: r5 = #5
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.h = vsplat(r3)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v3.uh = vcl0(v0.uh)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vadd(v3.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vasl(v0.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v6.h,v4.h)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5)
; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh)
; CHECK-NEXT: q1 = vcmp.eq(v5.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uh = vlsr(v7.uh,r5)
; CHECK-NEXT: v27 = vmux(q1,v2,v1)
; CHECK-NEXT: v1 = vmux(q0,v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vadd(v1.h,v4.h)
; CHECK-NEXT: v28.h = vadd(v26.h,v27.h)
; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v26.uh,r6)
; CHECK-NEXT: v1.h = vsub(v1.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uh = vlsr(v28.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,r4)
; CHECK-NEXT: v3 = vmux(q2,v30,v29)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v2,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <64 x i8>, ptr %a0, align 128
%v1 = uitofp <64 x i8> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}


; u8 -> f32
; No widening
define void @u8f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##.LCPI15_0,#8)
; CHECK-NEXT: v3:2.uh = vunpack(v1.ub)
; CHECK-NEXT: v1.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r6 = #512
; CHECK-NEXT: r7 = #255
; CHECK-NEXT: v3 = vmem(r3+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r0)
; CHECK-NEXT: v16 = vsplat(r6)
; CHECK-NEXT: v3 = vdelta(v1,v3)
; CHECK-NEXT: v0 = vxor(v0,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r7)
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: v5:4.uw = vunpack(v2.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19 = vsplat(r5)
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v31:30.uh = vunpack(v3.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vcl0(v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3:2.uw = vunpack(v30.uh)
; CHECK-NEXT: v6.w = vadd(v6.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vcl0(v5.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uw = vcl0(v2.uw)
; CHECK-NEXT: v7.w = vadd(v7.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vcl0(v3.uw)
; CHECK-NEXT: v11.w = vadd(v11.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.w = vasl(v4.w,v6.w)
; CHECK-NEXT: v12.w = vadd(v12.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.w = vasl(v5.w,v7.w)
; CHECK-NEXT: v20 = vand(v8,v16)
; CHECK-NEXT: v17.w = vadd(v8.w,v10.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v18.w = vasl(v2.w,v11.w)
; CHECK-NEXT: v22 = vand(v9,v16)
; CHECK-NEXT: q1 = vcmp.eq(v20.w,v0.w)
; CHECK-NEXT: v13.w = vadd(v9.w,v10.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.w = vasl(v3.w,v12.w)
; CHECK-NEXT: v28.w = vadd(v18.w,v10.w)
; CHECK-NEXT: q2 = vcmp.eq(v22.w,v0.w)
; CHECK-NEXT: v25 = vand(v18,v16)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29 = vmux(q1,v0,v1)
; CHECK-NEXT: v24 = vmux(q2,v0,v1)
; CHECK-NEXT: v16 = vand(v21,v16)
; CHECK-NEXT: q1 = vcmp.eq(v25.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uw = vlsr(v28.uw,r2)
; CHECK-NEXT: v10.w = vadd(v21.w,v10.w)
; CHECK-NEXT: q2 = vcmp.gt(v18.uw,v28.uw)
; CHECK-NEXT: q3 = vcmp.eq(v16.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.uw = vlsr(v18.uw,r2)
; CHECK-NEXT: q0 = vcmp.gt(v9.uw,v13.uw)
; CHECK-NEXT: v18 = vmux(q2,v1,v0)
; CHECK-NEXT: v30 = vmux(q1,v0,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v15.uw = vlsr(v13.uw,r2)
; CHECK-NEXT: q2 = vcmp.gt(v8.uw,v17.uw)
; CHECK-NEXT: v13.w = vadd(v26.w,v30.w)
; CHECK-NEXT: v27 = vmux(q3,v0,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uw = vlsr(v17.uw,r2)
; CHECK-NEXT: v30 = vmux(q0,v1,v0)
; CHECK-NEXT: q3 = vcmp.gt(v21.uw,v10.uw)
; CHECK-NEXT: v11.w = vsub(v18.w,v11.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uw = vlsr(v10.uw,r2)
; CHECK-NEXT: v7.w = vsub(v30.w,v7.w)
; CHECK-NEXT: v22.w = vadd(v23.w,v29.w)
; CHECK-NEXT: v29.w = vadd(v15.w,v24.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v16.uw = vlsr(v21.uw,r2)
; CHECK-NEXT: v21 = vmux(q2,v1,v0)
; CHECK-NEXT: v31.w = vadd(v25.w,v27.w)
; CHECK-NEXT: v1 = vmux(q3,v1,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uw = vlsr(v8.uw,r2)
; CHECK-NEXT: v6.w = vsub(v21.w,v6.w)
; CHECK-NEXT: v7.w = vadd(v7.w,v19.w)
; CHECK-NEXT: v1.w = vsub(v1.w,v12.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2)
; CHECK-NEXT: v6.w = vadd(v6.w,v19.w)
; CHECK-NEXT: v11.w = vadd(v11.w,v19.w)
; CHECK-NEXT: v1.w = vadd(v1.w,v19.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v18.uw = vlsr(v31.uw,r0)
; CHECK-NEXT: q1 = vcmp.eq(v20.w,v26.w)
; CHECK-NEXT: q0 = vcmp.eq(v16.w,v25.w)
; CHECK-NEXT: q2 = vcmp.eq(v14.w,v23.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uw = vlsr(v25.uw,r0)
; CHECK-NEXT: q3 = vcmp.eq(v9.w,v15.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.uw = vlsr(v22.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31.uw = vlsr(v23.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uw = vlsr(v29.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v15.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v26.uw,r0)
; CHECK-NEXT: v26 = vmux(q0,v18,v27)
; CHECK-NEXT: v8 = vmux(q3,v8,v24)
; CHECK-NEXT: v27 = vmux(q2,v20,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.w = vasl(v7.w,r4)
; CHECK-NEXT: q2 = vcmp.eq(v5.w,v0.w)
; CHECK-NEXT: q3 = vcmp.eq(v4.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13.uw = vlsr(v13.uw,r0)
; CHECK-NEXT: v7 = vor(v8,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v6.w,r4)
; CHECK-NEXT: v25 = vmux(q1,v13,v28)
; CHECK-NEXT: v29 = vmux(q2,v0,v7)
; CHECK-NEXT: vmem(r1+#1) = v29.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r4)
; CHECK-NEXT: v28 = vor(v27,v6)
; CHECK-NEXT: q2 = vcmp.eq(v3.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.w = vasl(v11.w,r4)
; CHECK-NEXT: v1 = vor(v26,v1)
; CHECK-NEXT: v30 = vmux(q3,v0,v28)
; CHECK-NEXT: vmem(r1+#0) = v30.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v25,v11)
; CHECK-NEXT: q3 = vcmp.eq(v2.w,v0.w)
; CHECK-NEXT: v1 = vmux(q2,v0,v1)
; CHECK-NEXT: vmem(r1+#3) = v1.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v0,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#2) = v0.new
; CHECK-NEXT: }
%v0 = load <128 x i8>, ptr %a0, align 128
%v1 = uitofp <128 x i8> %v0 to <128 x float>
store <128 x float> %v1, ptr %a1, align 128
ret void
}

; Widen input #1
define void @u8f32_1(ptr %a0, ptr %a1) #0 {
|
|
; CHECK-LABEL: u8f32_1:
|
|
; CHECK: .cfi_startproc
|
|
; CHECK-NEXT: // %bb.0:
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v0 = vmem(r0+#0)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: r7 = #1
|
|
; CHECK-NEXT: r6 = #512
|
|
; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v2 = vsplat(r7)
|
|
; CHECK-NEXT: v8 = vsplat(r6)
|
|
; CHECK-NEXT: r3:2 = combine(##255,#8)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v6 = vsplat(r3)
|
|
; CHECK-NEXT: r5 = #159
|
|
; CHECK-NEXT: v1:0.uw = vunpack(v0.uh)
|
|
; CHECK-NEXT: v3 = vxor(v3,v3)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v13 = vsplat(r5)
|
|
; CHECK-NEXT: r4 = #23
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v4.uw = vcl0(v0.uw)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v5.uw = vcl0(v1.uw)
|
|
; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v5.w = vadd(v5.w,v2.w)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v7.w = vasl(v0.w,v4.w)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v9.w = vasl(v1.w,v5.w)
|
|
; CHECK-NEXT: v11 = vand(v7,v8)
|
|
; CHECK-NEXT: v10.w = vadd(v7.w,v6.w)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v6.w = vadd(v9.w,v6.w)
|
|
; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w)
|
|
; CHECK-NEXT: v8 = vand(v9,v8)
|
|
; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2)
|
|
; CHECK-NEXT: v21 = vmux(q0,v3,v2)
|
|
; CHECK-NEXT: q3 = vcmp.eq(v8.w,v3.w)
|
|
; CHECK-NEXT: q0 = vcmp.gt(v9.uw,v6.uw)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v20.uw = vlsr(v6.uw,r2)
|
|
; CHECK-NEXT: v22 = vmux(q1,v2,v3)
|
|
; CHECK-NEXT: v24 = vmux(q3,v3,v2)
|
|
; CHECK-NEXT: v2 = vmux(q0,v2,v3)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v4.w = vsub(v22.w,v4.w)
|
|
; CHECK-NEXT: v2.w = vsub(v2.w,v5.w)
|
|
; CHECK-NEXT: v10.w = vadd(v19.w,v21.w)
|
|
; CHECK-NEXT: v25.w = vadd(v20.w,v24.w)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2)
|
|
; CHECK-NEXT: v4.w = vadd(v4.w,v13.w)
|
|
; CHECK-NEXT: v2.w = vadd(v2.w,v13.w)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v23.uw = vlsr(v9.uw,r2)
|
|
; CHECK-NEXT: q2 = vcmp.eq(v12.w,v19.w)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v11.uw = vlsr(v19.uw,r7)
|
|
; CHECK-NEXT: q3 = vcmp.eq(v23.w,v20.w)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v26.uw = vlsr(v20.uw,r7)
|
|
; CHECK-NEXT: v5 = vmux(q2,v27,v11)
|
|
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v6.uw = vlsr(v25.uw,r7)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v4.w = vasl(v4.w,r4)
|
|
; CHECK-NEXT: v6 = vmux(q3,v6,v26)
|
|
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v2.w = vasl(v2.w,r4)
|
|
; CHECK-NEXT: v29 = vor(v5,v4)
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v28 = vor(v6,v2)
|
|
; CHECK-NEXT: v31 = vmux(q3,v3,v29)
|
|
; CHECK-NEXT: vmem(r1+#0) = v31.new
|
|
; CHECK-NEXT: }
|
|
; CHECK-NEXT: {
|
|
; CHECK-NEXT: v30 = vmux(q2,v3,v28)
|
|
; CHECK-NEXT: jumpr r31
|
|
; CHECK-NEXT: vmem(r1+#1) = v30.new
|
|
; CHECK-NEXT: }
|
|
%v0 = load <64 x i8>, ptr %a0, align 128
|
|
%v1 = uitofp <64 x i8> %v0 to <64 x float>
|
|
store <64 x float> %v1, ptr %a1, align 128
|
|
ret void
|
|
}
|
|
|
|
; Widen input #2
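; The <32 x i8> source covers only a quarter of an HVX vector and is widened;
; the <32 x float> result is exactly one full vector, so the final store needs
; no predicate.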
define void @u8f32_2(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f32_2:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r6)
; CHECK-NEXT: v29 = vsplat(r2)
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r3)
; CHECK-NEXT: r5:4 = combine(##159,#8)
; CHECK-NEXT: v5:4.uw = vunpack(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r5)
; CHECK-NEXT: q3 = vcmp.eq(v4.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vcl0(v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v5.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v4.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.w = vadd(v6.w,v29.w)
; CHECK-NEXT: v3 = vand(v6,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4)
; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v0.uw)
; CHECK-NEXT: q1 = vcmp.eq(v3.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v0.uw = vlsr(v0.uw,r4)
; CHECK-NEXT: v3 = vmux(q1,v2,v1)
; CHECK-NEXT: v1 = vmux(q0,v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vsub(v1.w,v5.w)
; CHECK-NEXT: v3.w = vadd(v0.w,v3.w)
; CHECK-NEXT: q2 = vcmp.eq(v6.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v0.uw,r6)
; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31.uw = vlsr(v3.uw,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r4)
; CHECK-NEXT: v0 = vmux(q2,v31,v30)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vor(v0,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v2,v0)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <32 x i8>, ptr %a0, align 128
%v1 = uitofp <32 x i8> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}


; u16 -> f16
; No widening
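; <64 x i16> in and <64 x half> out each fill one 128-byte vector, so nothing
; is widened. The sequence below builds f16 values manually: vcl0 counts the
; leading zeros to normalize each lane, the vand/vcmp pairs decide the
; rounding, and the biased exponent is shifted into place with vasl(...,#10)
; (f16 has 10 mantissa bits) before vor merges it in; zero inputs are patched
; back with the final vmux.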
define void @u16f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#64,#1)
; CHECK-NEXT: r5 = #31
; CHECK-NEXT: v1.uh = vcl0(v0.uh)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r2)
; CHECK-NEXT: v5.h = vsplat(r3)
; CHECK-NEXT: r4 = #5
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.h = vsplat(r5)
; CHECK-NEXT: r3 = #10
; CHECK-NEXT: v1.h = vadd(v1.h,v2.h)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vasl(v0.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v6.h,v4.h)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uh = vlsr(v6.uh,r4)
; CHECK-NEXT: q0 = vcmp.eq(v5.h,v3.h)
; CHECK-NEXT: q1 = vcmp.gt(v6.uh,v7.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uh = vlsr(v7.uh,r4)
; CHECK-NEXT: v27 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vadd(v2.h,v4.h)
; CHECK-NEXT: v28.h = vadd(v26.h,v27.h)
; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v26.uh,r2)
; CHECK-NEXT: v1.h = vsub(v2.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,r3)
; CHECK-NEXT: v2 = vmux(q2,v30,v29)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v3,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <64 x i16>, ptr %a0, align 128
%v1 = uitofp <64 x i16> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}

; Widen input and result
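; Both the <32 x i16> input and the <32 x half> result are half a vector wide,
; so the result is written with a predicated store: vsetq(#64) masks off the
; upper half of the destination vector.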
define void @u16f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#31,#1)
; CHECK-NEXT: r6 = #64
; CHECK-NEXT: v1.uh = vcl0(v0.uh)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r2)
; CHECK-NEXT: v4.h = vsplat(r3)
; CHECK-NEXT: r5 = #5
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.h = vsplat(r6)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v1.h = vadd(v1.h,v2.h)
; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: q3 = vsetq(r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vasl(v0.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v6.h,v4.h)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5)
; CHECK-NEXT: q1 = vcmp.eq(v5.h,v3.h)
; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uh = vlsr(v7.uh,r5)
; CHECK-NEXT: v5 = vmux(q1,v3,v2)
; CHECK-NEXT: v2 = vmux(q0,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vadd(v2.h,v4.h)
; CHECK-NEXT: v28.h = vadd(v7.h,v5.h)
; CHECK-NEXT: q1 = vcmp.eq(v6.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2)
; CHECK-NEXT: v1.h = vsub(v2.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,r4)
; CHECK-NEXT: v2 = vmux(q1,v30,v29)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v3,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
%v0 = load <32 x i16>, ptr %a0, align 128
%v1 = uitofp <32 x i16> %v0 to <32 x half>
store <32 x half> %v1, ptr %a1, align 128
ret void
}


; u16 -> f32
; No widening
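; A full <64 x i16> vector unpacks into a pair of 32-element word vectors, and
; the <64 x float> result fills two vectors as well, so both stores are
; unconditional.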
define void @u16f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r7 = #1
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: v1:0.uw = vunpack(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r7)
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r6 = #512
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8 = vsplat(r6)
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v4.uw = vcl0(v0.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14 = vsplat(r5)
; CHECK-NEXT: v5.uw = vcl0(v1.uw)
; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v5.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.w = vasl(v0.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.w = vasl(v1.w,v5.w)
; CHECK-NEXT: v10.w = vadd(v7.w,v6.w)
; CHECK-NEXT: v11 = vand(v7,v8)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v9.w,v6.w)
; CHECK-NEXT: v8 = vand(v9,v8)
; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2)
; CHECK-NEXT: q2 = vcmp.eq(v8.w,v3.w)
; CHECK-NEXT: q3 = vcmp.gt(v9.uw,v6.uw)
; CHECK-NEXT: v20 = vmux(q0,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: v22 = vmux(q2,v3,v2)
; CHECK-NEXT: v25 = vmux(q1,v2,v3)
; CHECK-NEXT: v2 = vmux(q3,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vsub(v25.w,v4.w)
; CHECK-NEXT: v2.w = vsub(v2.w,v5.w)
; CHECK-NEXT: v23.w = vadd(v19.w,v20.w)
; CHECK-NEXT: v10.w = vadd(v21.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2)
; CHECK-NEXT: v4.w = vadd(v4.w,v14.w)
; CHECK-NEXT: v2.w = vadd(v2.w,v14.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v9.uw,r2)
; CHECK-NEXT: q2 = vcmp.eq(v12.w,v19.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13.uw = vlsr(v19.uw,r7)
; CHECK-NEXT: q3 = vcmp.eq(v24.w,v21.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uw = vlsr(v23.uw,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7)
; CHECK-NEXT: v5 = vmux(q2,v26,v13)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v21.uw,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasl(v4.w,r4)
; CHECK-NEXT: v6 = vmux(q3,v27,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r4)
; CHECK-NEXT: v29 = vor(v5,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vor(v6,v2)
; CHECK-NEXT: v31 = vmux(q3,v3,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v3,v2)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
%v0 = load <64 x i16>, ptr %a0, align 128
%v1 = uitofp <64 x i16> %v0 to <64 x float>
store <64 x float> %v1, ptr %a1, align 128
ret void
}

; Widen input
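; The <32 x i16> source is half a vector, but the <32 x float> result is a
; full vector, so only the input side is widened.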
define void @u16f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v1:0.uw = vunpack(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r6)
; CHECK-NEXT: v4 = vsplat(r2)
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: r5:4 = combine(##159,#8)
; CHECK-NEXT: v3.uw = vcl0(v0.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r5)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: v3.w = vadd(v3.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4)
; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT: q1 = vcmp.eq(v5.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r4)
; CHECK-NEXT: v5 = vmux(q1,v2,v1)
; CHECK-NEXT: v1 = vmux(q0,v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vsub(v1.w,v3.w)
; CHECK-NEXT: v29.w = vadd(v4.w,v5.w)
; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6)
; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.uw = vlsr(v29.uw,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r4)
; CHECK-NEXT: v3 = vmux(q2,v3,v30)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v2,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <32 x i16>, ptr %a0, align 128
%v1 = uitofp <32 x i16> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}


; u32 -> f16
; No widening
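; u32 -> f16 is lowered in two steps: the words are first assembled as f32
; bit patterns, then the vector pair is narrowed through the qf32 format
; (requires +hvx-qfloat), with vdeal restoring element order after the
; v3:2.qf32 -> .hf conversion.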
define void @u32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#8,#1)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v1.uw = vcl0(v0.uw)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r2)
; CHECK-NEXT: r4 = #512
; CHECK-NEXT: v3.uw = vcl0(v2.uw)
; CHECK-NEXT: v2.cur = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v6 = vsplat(r6)
; CHECK-NEXT: v1.w = vadd(v1.w,v4.w)
; CHECK-NEXT: v3.w = vadd(v3.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v9 = vxor(v9,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r4)
; CHECK-NEXT: v5.w = vasl(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.w = vasl(v2.w,v3.w)
; CHECK-NEXT: v11.w = vadd(v5.w,v6.w)
; CHECK-NEXT: v13 = vand(v5,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v8.w,v6.w)
; CHECK-NEXT: v7 = vand(v8,v7)
; CHECK-NEXT: q0 = vcmp.gt(v5.uw,v11.uw)
; CHECK-NEXT: q1 = vcmp.eq(v13.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uw = vlsr(v11.uw,r3)
; CHECK-NEXT: q3 = vcmp.gt(v8.uw,v6.uw)
; CHECK-NEXT: q2 = vcmp.eq(v7.w,v9.w)
; CHECK-NEXT: v29 = vmux(q0,v4,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r3)
; CHECK-NEXT: v28 = vmux(q1,v9,v4)
; CHECK-NEXT: v30 = vmux(q3,v4,v9)
; CHECK-NEXT: v4 = vmux(q2,v9,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vsub(v29.w,v1.w)
; CHECK-NEXT: v7.w = vadd(v27.w,v28.w)
; CHECK-NEXT: v3.w = vsub(v30.w,v3.w)
; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3)
; CHECK-NEXT: v1.w = vadd(v1.w,v10.w)
; CHECK-NEXT: v3.w = vadd(v3.w,v10.w)
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: v14.uw = vlsr(v8.uw,r3)
; CHECK-NEXT: q3 = vcmp.eq(v12.w,v27.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v27.uw,r2)
; CHECK-NEXT: q1 = vcmp.eq(v14.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v5 = vmux(q3,v7,v5)
; CHECK-NEXT: q3 = vcmp.eq(v2.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r3)
; CHECK-NEXT: v31 = vmux(q1,v4,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.w = vasl(v3.w,r3)
; CHECK-NEXT: v1 = vor(v5,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vor(v31,v3)
; CHECK-NEXT: v1 = vmux(q2,v9,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v9,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.qf32 = vadd(v1.sf,v9.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v9.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.hf = v3:2.qf32
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.h = vdeal(v0.h)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <64 x i32>, ptr %a0, align 128
%v1 = uitofp <64 x i32> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}

; Widen result
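; The <32 x i32> input is one full vector, but the <32 x half> result is only
; 64 bytes, so the final store is predicated on vsetq(#64).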
define void @u32f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##512,#1)
; CHECK-NEXT: v1.uw = vcl0(v0.uw)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r2)
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r6)
; CHECK-NEXT: r5 = #8
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v1.w = vadd(v1.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5)
; CHECK-NEXT: q0 = vcmp.eq(v5.w,v2.w)
; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5)
; CHECK-NEXT: v5 = vmux(q0,v2,v3)
; CHECK-NEXT: v3 = vmux(q1,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vsub(v3.w,v1.w)
; CHECK-NEXT: v30.w = vadd(v4.w,v5.w)
; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #64
; CHECK-NEXT: v3.uw = vlsr(v30.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r3)
; CHECK-NEXT: q3 = vsetq(r2)
; CHECK-NEXT: v3 = vmux(q1,v3,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.qf32 = vadd(v2.sf,v2.sf)
; CHECK-NEXT: v0 = vor(v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v2,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v2.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.hf = v1:0.qf32
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.h = vdeal(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
%v0 = load <32 x i32>, ptr %a0, align 128
%v1 = uitofp <32 x i32> %v0 to <32 x half>
store <32 x half> %v1, ptr %a1, align 128
ret void
}

; u32 -> f32
; No widening
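; <32 x i32> in and <32 x float> out are both exactly one vector. The same
; normalize/round/assemble pattern is used, with a 23-bit mantissa shift and
; exponent bias 159 (= 127 + 32, accounting for the 32-bit normalization).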
define void @u32f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##512,#1)
; CHECK-NEXT: v1.uw = vcl0(v0.uw)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r2)
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r6)
; CHECK-NEXT: r5 = #8
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v1.w = vadd(v1.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5)
; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5)
; CHECK-NEXT: v5 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vsub(v2.w,v1.w)
; CHECK-NEXT: v29.w = vadd(v4.w,v5.w)
; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r3)
; CHECK-NEXT: v2 = vmux(q2,v2,v30)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v3,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <32 x i32>, ptr %a0, align 128
%v1 = uitofp <32 x i32> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}

; Widen input and result
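; The <16 x i32> input and <16 x float> result each occupy only 64 bytes, so
; the input is widened and the store is masked with vsetq(#64).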
define void @u32f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##512,#1)
; CHECK-NEXT: v1.uw = vcl0(v0.uw)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r2)
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r6)
; CHECK-NEXT: r5 = #8
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v1.w = vadd(v1.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5)
; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5)
; CHECK-NEXT: v5 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vsub(v2.w,v1.w)
; CHECK-NEXT: v29.w = vadd(v4.w,v5.w)
; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #64
; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r3)
; CHECK-NEXT: q3 = vsetq(r2)
; CHECK-NEXT: v2 = vmux(q1,v2,v30)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v3,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
%v0 = load <16 x i32>, ptr %a0, align 128
%v1 = uitofp <16 x i32> %v0 to <16 x float>
store <16 x float> %v1, ptr %a1, align 128
ret void
}


attributes #0 = { "target-features"="+v68,+hvxv68,+hvx-length128b,+hvx-qfloat" }