forked from OSchip/llvm-project
[TargetLowering] Only demand a funnelshift's modulo amount bits
The shift amount of ISD::FSHL/FSHR is always interpreted modulo the bitwidth, so for power-of-2 bitwidths only the lowest log2(BitWidth) bits of the amount (mask BitWidth - 1) need to be demanded.
This commit is contained in:
parent
85462aefb5
commit
2b3b453a82
|
@ -1634,6 +1634,14 @@ bool TargetLowering::SimplifyDemandedBits(
|
|||
Known.One |= Known2.One;
|
||||
Known.Zero |= Known2.Zero;
|
||||
}
|
||||
|
||||
// For pow-2 bitwidths we only demand the bottom modulo amt bits.
|
||||
if (isPowerOf2_32(BitWidth)) {
|
||||
APInt DemandedAmtBits(Op2.getScalarValueSizeInBits(), BitWidth - 1);
|
||||
if (SimplifyDemandedBits(Op2, DemandedAmtBits, DemandedElts,
|
||||
Known2, TLO, Depth + 1))
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ISD::ROTL:
|
||||
|
|
|
@ -16,13 +16,13 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
|
|||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: s_sub_i32 s3, 32, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: s_sub_i32 s1, 32, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: s_and_b32 s1, s2, 31
|
||||
; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
|
@ -32,13 +32,13 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
|
|||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: s_sub_i32 s3, 32, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: s_sub_i32 s1, 32, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: s_and_b32 s1, s2, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
|
@ -50,13 +50,13 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
|
|||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: s_sub_i32 s3, 32, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_sub_i32 s1, 32, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s2, 31
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v2
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
|
@ -70,12 +70,12 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
|
|||
; R600-NEXT: CF_END
|
||||
; R600-NEXT: PAD
|
||||
; R600-NEXT: ALU clause starting at 4:
|
||||
; R600-NEXT: AND_INT * T0.W, KC0[3].X, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SUB_INT * T1.W, literal.x, PV.W,
|
||||
; R600-NEXT: SUB_INT * T0.W, literal.x, KC0[3].X,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[2].Z, KC0[2].W, PV.W,
|
||||
; R600-NEXT: CNDE_INT T0.X, T0.W, KC0[2].Z, PV.W,
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, PV.W,
|
||||
; R600-NEXT: AND_INT * T1.W, KC0[3].X, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: CNDE_INT T0.X, PS, KC0[2].Z, PV.W,
|
||||
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
|
@ -149,15 +149,15 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
|||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s9
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: s_sub_i32 s10, 32, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s10
|
||||
; SI-NEXT: s_and_b32 s0, s0, 31
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: v_alignbit_b32 v0, s3, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: s_sub_i32 s1, 32, s0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; SI-NEXT: s_and_b32 s0, s0, 31
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2
|
||||
|
@ -175,15 +175,15 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
|||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; VI-NEXT: s_sub_i32 s8, 32, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: s_sub_i32 s7, 32, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: s_sub_i32 s1, 32, s0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2
|
||||
|
@ -203,15 +203,15 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
|||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX9-NEXT: s_sub_i32 s8, 32, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: s_sub_i32 s7, 32, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: s_sub_i32 s1, 32, s0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2
|
||||
|
@ -225,25 +225,24 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
|||
;
|
||||
; R600-LABEL: fshl_v2i32:
|
||||
; R600: ; %bb.0: ; %entry
|
||||
; R600-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
||||
; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
|
||||
; R600-NEXT: CF_END
|
||||
; R600-NEXT: PAD
|
||||
; R600-NEXT: ALU clause starting at 4:
|
||||
; R600-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SUB_INT T1.W, literal.x, PV.W,
|
||||
; R600-NEXT: AND_INT * T2.W, KC0[3].W, literal.y,
|
||||
; R600-NEXT: 32(4.484155e-44), 31(4.344025e-44)
|
||||
; R600-NEXT: SUB_INT T0.Z, literal.x, PS,
|
||||
; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
|
||||
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, KC0[3].X,
|
||||
; R600-NEXT: AND_INT T0.W, KC0[4].X, literal.x,
|
||||
; R600-NEXT: SUB_INT * T1.W, literal.y, KC0[4].X,
|
||||
; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
|
||||
; R600-NEXT: AND_INT T0.Y, KC0[3].W, literal.x,
|
||||
; R600-NEXT: SUB_INT T0.Z, literal.y, KC0[3].W,
|
||||
; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PS,
|
||||
; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
|
||||
; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
|
||||
; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].X,
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].W, KC0[3].Y, PV.Z,
|
||||
; R600-NEXT: SETE_INT * T1.W, T2.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T0.X, PS, PV.W, KC0[2].W,
|
||||
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: SETE_INT * T1.W, PV.Y, 0.0,
|
||||
; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[2].W,
|
||||
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
|
||||
|
@ -327,31 +326,31 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
|||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s15
|
||||
; SI-NEXT: s_and_b32 s3, s3, 31
|
||||
; SI-NEXT: s_sub_i32 s16, 32, s3
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s16
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: s_and_b32 s3, s3, 31
|
||||
; SI-NEXT: v_alignbit_b32 v0, s11, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s11
|
||||
; SI-NEXT: s_sub_i32 s3, 32, s2
|
||||
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s14
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: v_alignbit_b32 v0, s10, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s10
|
||||
; SI-NEXT: s_sub_i32 s2, 32, s1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s13
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; SI-NEXT: s_and_b32 s0, s0, 31
|
||||
; SI-NEXT: v_alignbit_b32 v0, s9, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SI-NEXT: s_sub_i32 s1, 32, s0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; SI-NEXT: s_and_b32 s0, s0, 31
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s1
|
||||
; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
|
||||
|
@ -369,31 +368,31 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
|||
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s11
|
||||
; VI-NEXT: s_sub_i32 s14, 32, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s14
|
||||
; VI-NEXT: s_and_b32 s3, s3, 31
|
||||
; VI-NEXT: s_sub_i32 s11, 32, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s11
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: v_alignbit_b32 v0, s7, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: s_sub_i32 s3, 32, s2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s10
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: v_alignbit_b32 v0, s6, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s6
|
||||
; VI-NEXT: s_sub_i32 s2, 32, s1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s9
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: s_sub_i32 s1, 32, s0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s1
|
||||
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
|
||||
|
@ -413,31 +412,31 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
|||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GFX9-NEXT: s_sub_i32 s14, 32, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s14
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 31
|
||||
; GFX9-NEXT: s_sub_i32 s11, 32, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s11
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: s_sub_i32 s3, 32, s2
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX9-NEXT: s_sub_i32 s2, 32, s1
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: s_sub_i32 s1, 32, s0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4
|
||||
|
@ -451,38 +450,36 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
|||
;
|
||||
; R600-LABEL: fshl_v4i32:
|
||||
; R600: ; %bb.0: ; %entry
|
||||
; R600-NEXT: ALU 27, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
|
||||
; R600-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
|
||||
; R600-NEXT: CF_END
|
||||
; R600-NEXT: PAD
|
||||
; R600-NEXT: ALU clause starting at 4:
|
||||
; R600-NEXT: AND_INT T0.W, KC0[6].X, literal.x,
|
||||
; R600-NEXT: AND_INT * T1.W, KC0[5].W, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
|
||||
; R600-NEXT: SUB_INT * T0.W, literal.x, KC0[6].X,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: AND_INT T0.X, KC0[5].Y, literal.x,
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[4].X, KC0[5].X, PV.W,
|
||||
; R600-NEXT: SETE_INT T0.Z, T0.W, 0.0,
|
||||
; R600-NEXT: SUB_INT * T0.W, literal.y, T1.W,
|
||||
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
|
||||
; R600-NEXT: AND_INT * T1.W, KC0[6].X, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: AND_INT T0.X, KC0[5].Z, literal.x,
|
||||
; R600-NEXT: SUB_INT T0.Y, literal.y, KC0[5].Z,
|
||||
; R600-NEXT: SETE_INT T0.Z, PV.W, 0.0,
|
||||
; R600-NEXT: SUB_INT T1.W, literal.y, KC0[5].W,
|
||||
; R600-NEXT: AND_INT * T2.W, KC0[5].W, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
|
||||
; R600-NEXT: AND_INT * T2.W, KC0[5].Z, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SUB_INT T1.Y, literal.x, PV.W,
|
||||
; R600-NEXT: BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, T0.W,
|
||||
; R600-NEXT: SETE_INT * T0.W, T1.W, 0.0,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: CNDE_INT * T1.W, T0.Z, T0.Y, KC0[4].X,
|
||||
; R600-NEXT: CNDE_INT T1.Z, T0.W, T1.Z, KC0[3].W,
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[3].Z, KC0[4].Z, T1.Y,
|
||||
; R600-NEXT: SETE_INT * T2.W, T2.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].Z,
|
||||
; R600-NEXT: SUB_INT * T0.W, literal.x, T0.X,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[3].Y, KC0[4].Y, PV.W,
|
||||
; R600-NEXT: SETE_INT T1.Z, PS, 0.0,
|
||||
; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].W, KC0[4].W, PV.W,
|
||||
; R600-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, KC0[4].X,
|
||||
; R600-NEXT: CNDE_INT T0.Z, T1.Z, T1.W, KC0[3].W,
|
||||
; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].Z, KC0[4].Z, T0.Y,
|
||||
; R600-NEXT: SETE_INT * T2.W, T0.X, 0.0,
|
||||
; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[3].Y,
|
||||
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, KC0[3].Z,
|
||||
; R600-NEXT: AND_INT T1.W, KC0[5].Y, literal.x,
|
||||
; R600-NEXT: SUB_INT * T2.W, literal.y, KC0[5].Y,
|
||||
; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
|
||||
; R600-NEXT: BIT_ALIGN_INT T2.W, KC0[3].Y, KC0[4].Y, PS,
|
||||
; R600-NEXT: SETE_INT * T1.W, PV.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T0.X, PS, PV.W, KC0[3].Y,
|
||||
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
%0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
|
||||
|
|
|
@ -88,9 +88,9 @@ declare i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
|
|||
define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
|
||||
; X86-LABEL: n6_fshl:
|
||||
; X86: # %bb.0:
|
||||
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
|
||||
; X86-NEXT: shldl %cl, %edx, %eax
|
||||
; X86-NEXT: retl
|
||||
;
|
||||
|
@ -108,9 +108,9 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
|
|||
define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind {
|
||||
; X86-LABEL: n7_fshr:
|
||||
; X86: # %bb.0:
|
||||
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
|
||||
; X86-NEXT: shrdl %cl, %edx, %eax
|
||||
; X86-NEXT: retl
|
||||
;
|
||||
|
|
Loading…
Reference in New Issue