[DAG][AMDGPU][X86] Add SimplifyMultipleUseDemandedBits handling for SIGN/ZERO_EXTEND + SIGN/ZERO_EXTEND_VECTOR_INREG

Peek through multiple-use ops, as we already do for ANY_EXTEND/ANY_EXTEND_VECTOR_INREG.

Differential Revision: https://reviews.llvm.org/D84863
Simon Pilgrim 2020-07-29 17:33:56 +01:00
parent 1d51dc38d8
commit fdc902774e
11 changed files with 214 additions and 204 deletions
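For context, SimplifyMultipleUseDemandedBits differs from SimplifyDemandedBits in that it only returns an existing (or simpler) value that is equivalent on the demanded bits; it never rewrites the node it inspects, which is what makes it safe to apply to sources with multiple uses. Below is a minimal standalone sketch of that idea in C++, using a toy Node type and a hypothetical visitZeroExtend helper rather than LLVM's actual SelectionDAG classes:

// Toy model of the peek-through -- not LLVM's SelectionDAG API.
#include <cassert>
#include <cstdint>
#include <cstdio>

enum class Opcode { Constant, And, ZeroExtend };

struct Node {
  Opcode Op;
  uint64_t Imm = 0;  // payload for Constant nodes
  Node *A = nullptr; // first operand
  Node *B = nullptr; // second operand
  int Uses = 0;      // reference count from other nodes
};

// Return an existing value equal to N on the Demanded bits, or nullptr.
// N is never mutated, so this is safe even when N->Uses > 1 -- the core
// property behind SimplifyMultipleUseDemandedBits.
Node *simplifyMultipleUseDemandedBits(Node *N, uint64_t Demanded) {
  if (N->Op == Opcode::And && N->B && N->B->Op == Opcode::Constant &&
      (Demanded & ~N->B->Imm) == 0)
    return N->A; // (x & mask) == x on every demanded bit
  return nullptr;
}

// Analogue of the ZERO_EXTEND hunk below: only the low InBits of the
// source are demanded, so try to rebuild the extend around a simpler
// source (hypothetical helper; Result provides storage for the new node).
Node *visitZeroExtend(Node *Ext, unsigned InBits, Node *Result) {
  assert(Ext->Op == Opcode::ZeroExtend && InBits < 64);
  uint64_t InDemanded = (uint64_t(1) << InBits) - 1;
  if (Node *NewSrc = simplifyMultipleUseDemandedBits(Ext->A, InDemanded)) {
    *Result = Node{Opcode::ZeroExtend, 0, NewSrc, nullptr, 0};
    return Result;
  }
  return Ext;
}

int main() {
  Node X{Opcode::Constant, 0xAB, nullptr, nullptr, 2};
  Node Mask{Opcode::Constant, 0xFF, nullptr, nullptr, 1};
  Node And{Opcode::And, 0, &X, &Mask, 2}; // multi-use: also used elsewhere
  Node Ext{Opcode::ZeroExtend, 0, &And, nullptr, 1};

  Node Result{};
  Node *N = visitZeroExtend(&Ext, /*InBits=*/8, &Result);
  // The AND is a no-op on the demanded low 8 bits, so the extend now
  // reads X directly while the multi-use AND stays intact.
  std::printf("peeked through AND: %s\n", N->A == &X ? "yes" : "no");
}

The essential property mirrored from the patch is that the multi-use source (the AND here) is only bypassed, never modified, so its other uses still see the original value.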


@@ -1858,6 +1858,11 @@ bool TargetLowering::SimplifyDemandedBits(
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
assert(Known.getBitWidth() == InBits && "Src width has changed?");
Known = Known.zext(BitWidth);
// Attempt to avoid multi-use ops if we don't need anything from them.
if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc));
break;
}
case ISD::SIGN_EXTEND:
@@ -1906,6 +1911,11 @@ bool TargetLowering::SimplifyDemandedBits(
if (!TLO.LegalOperations() || isOperationLegal(Opc, VT))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
}
// Attempt to avoid multi-use ops if we don't need anything from them.
if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc));
break;
}
case ISD::ANY_EXTEND:


@@ -493,11 +493,11 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
; SI-NEXT: v_bfi_b32 v1, s4, v1, v3
; SI-NEXT: v_bfi_b32 v0, s4, v0, v4
; SI-NEXT: v_bfi_b32 v2, s4, v2, v5
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v3i16:
@@ -515,27 +515,27 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
; SI-LABEL: v_bswap_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v4, v3, v3, 8
; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: s_mov_b32 s5, 0xffff0000
; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8
; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
; SI-NEXT: v_alignbit_b32 v6, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: v_alignbit_b32 v7, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_bfi_b32 v3, s4, v3, v4
; SI-NEXT: v_bfi_b32 v2, s4, v2, v5
; SI-NEXT: v_bfi_b32 v1, s4, v1, v6
; SI-NEXT: v_bfi_b32 v0, s4, v0, v7
; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8
; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
; SI-NEXT: v_alignbit_b32 v7, v2, v2, 8
; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
; SI-NEXT: v_bfi_b32 v1, s4, v1, v4
; SI-NEXT: v_bfi_b32 v0, s4, v0, v5
; SI-NEXT: v_bfi_b32 v3, s4, v3, v6
; SI-NEXT: v_bfi_b32 v2, s4, v2, v7
; SI-NEXT: v_and_b32_e32 v4, s5, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v3, s5, v3
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;


@@ -767,21 +767,21 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
; SI-NEXT: v_lshl_b32_e32 v0, v0, v7
; SI-NEXT: v_or_b32_e32 v0, v0, v6
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; SI-NEXT: v_mov_b32_e32 v9, 0xffff
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; SI-NEXT: v_and_b32_e32 v3, 15, v8
; SI-NEXT: v_sub_i32_e32 v6, vcc, 16, v3
; SI-NEXT: v_and_b32_e32 v10, s4, v5
; SI-NEXT: v_lshr_b32_e32 v4, v10, v3
; SI-NEXT: v_lshl_b32_e32 v2, v2, v6
; SI-NEXT: v_mov_b32_e32 v9, 0xffff
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; SI-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, v9, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 15, v8
; SI-NEXT: v_sub_i32_e32 v4, vcc, 16, v1
; SI-NEXT: v_and_b32_e32 v10, s4, v5
; SI-NEXT: v_lshr_b32_e32 v3, v10, v1
; SI-NEXT: v_lshl_b32_e32 v2, v2, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
; SI-NEXT: v_and_b32_e32 v2, v9, v1
; SI-NEXT: v_alignbit_b32 v1, v1, v0, 16
; SI-NEXT: v_and_b32_e32 v2, v9, v3
; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v3i16:
@@ -865,46 +865,46 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_and_b32_e32 v11, 15, v11
; SI-NEXT: v_and_b32_e32 v16, s4, v7
; SI-NEXT: v_sub_i32_e32 v17, vcc, 16, v11
; SI-NEXT: v_lshr_b32_e32 v16, v16, v11
; SI-NEXT: v_lshl_b32_e32 v3, v3, v17
; SI-NEXT: v_or_b32_e32 v3, v3, v16
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
; SI-NEXT: v_and_b32_e32 v9, 15, v9
; SI-NEXT: v_and_b32_e32 v16, s4, v5
; SI-NEXT: v_sub_i32_e32 v17, vcc, 16, v9
; SI-NEXT: v_lshr_b32_e32 v16, v16, v9
; SI-NEXT: v_lshl_b32_e32 v1, v1, v17
; SI-NEXT: v_or_b32_e32 v1, v1, v16
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; SI-NEXT: v_and_b32_e32 v5, 15, v8
; SI-NEXT: v_sub_i32_e32 v9, vcc, 16, v5
; SI-NEXT: v_and_b32_e32 v15, s4, v4
; SI-NEXT: v_lshr_b32_e32 v8, v15, v5
; SI-NEXT: v_lshl_b32_e32 v0, v0, v9
; SI-NEXT: v_or_b32_e32 v0, v0, v8
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: v_and_b32_e32 v4, 15, v11
; SI-NEXT: v_sub_i32_e32 v8, vcc, 16, v4
; SI-NEXT: v_and_b32_e32 v14, s4, v7
; SI-NEXT: v_lshr_b32_e32 v5, v14, v4
; SI-NEXT: v_lshl_b32_e32 v3, v3, v8
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v5
; SI-NEXT: v_and_b32_e32 v4, 15, v10
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; SI-NEXT: v_and_b32_e32 v7, 15, v10
; SI-NEXT: v_sub_i32_e32 v11, vcc, 16, v7
; SI-NEXT: v_and_b32_e32 v15, s4, v6
; SI-NEXT: v_lshr_b32_e32 v10, v15, v7
; SI-NEXT: v_lshl_b32_e32 v2, v2, v11
; SI-NEXT: v_or_b32_e32 v2, v2, v10
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v4
; SI-NEXT: v_and_b32_e32 v13, s4, v6
; SI-NEXT: v_lshr_b32_e32 v5, v13, v4
; SI-NEXT: v_lshl_b32_e32 v2, v2, v7
; SI-NEXT: v_or_b32_e32 v2, v2, v5
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; SI-NEXT: v_mov_b32_e32 v12, 0xffff
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_and_b32_e32 v2, v12, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 15, v9
; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v3
; SI-NEXT: v_and_b32_e32 v14, s4, v5
; SI-NEXT: v_lshr_b32_e32 v6, v14, v3
; SI-NEXT: v_lshl_b32_e32 v1, v1, v7
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v6
; SI-NEXT: v_and_b32_e32 v3, 15, v8
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; SI-NEXT: v_sub_i32_e32 v6, vcc, 16, v3
; SI-NEXT: v_and_b32_e32 v13, s4, v4
; SI-NEXT: v_lshr_b32_e32 v5, v13, v3
; SI-NEXT: v_lshl_b32_e32 v0, v0, v6
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v5
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, v12, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;


@@ -1470,21 +1470,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40018
; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40014
; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40010
; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40000
; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40004
; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40008
; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40008
; GFX7-NEXT: s_bfe_i32 s19, s5, 0x4000c
; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40000
; GFX7-NEXT: s_ashr_i32 s14, s5, 28
; GFX7-NEXT: s_bfe_i32 s5, s5, 0x4000c
; GFX7-NEXT: s_bfe_i32 s5, s5, 0x40004
; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018
; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014
; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010
; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40000
; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008
; GFX7-NEXT: v_mov_b32_e32 v4, s18
; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40004
; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c
; GFX7-NEXT: v_mov_b32_e32 v3, s19
; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40008
; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40000
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: s_bfe_i32 s4, s4, 0x4000c
; GFX7-NEXT: s_bfe_i32 s4, s4, 0x40004
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1
; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2
@@ -1494,17 +1494,17 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: v_and_b32_e32 v2, s8, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_and_b32_e32 v4, s8, v4
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
; GFX7-NEXT: v_or_b32_e32 v2, v4, v3
; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
; GFX7-NEXT: v_or_b32_e32 v2, v2, v1
; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX7-NEXT: v_mov_b32_e32 v5, s17
; GFX7-NEXT: v_mov_b32_e32 v6, s16
; GFX7-NEXT: v_mov_b32_e32 v7, s15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GFX7-NEXT: v_mad_i32_i24 v0, s10, v5, v0
; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0


@@ -1988,17 +1988,17 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40004
; GFX7-NEXT: s_bfe_u32 s17, s5, 0x40004
; GFX7-NEXT: s_bfe_u32 s19, s5, 0x4000c
; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c
; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c
; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004
; GFX7-NEXT: v_mov_b32_e32 v4, s17
; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018
; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014
; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010
; GFX7-NEXT: s_and_b32 s18, s5, 15
; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008
; GFX7-NEXT: s_lshr_b32 s13, s5, 28
; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40008
; GFX7-NEXT: s_bfe_u32 s12, s4, 0x4000c
; GFX7-NEXT: s_and_b32 s5, s5, 15
; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004
; GFX7-NEXT: v_mov_b32_e32 v2, s19
; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2
; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4
@@ -2006,25 +2006,25 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018
; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014
; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010
; GFX7-NEXT: s_and_b32 s11, s4, 15
; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008
; GFX7-NEXT: v_mov_b32_e32 v3, s18
; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40008
; GFX7-NEXT: s_and_b32 s4, s4, 15
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mul_u32_u24_e32 v1, s4, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: v_or_b32_e32 v2, v3, v4
; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX7-NEXT: v_mov_b32_e32 v5, s16
; GFX7-NEXT: v_mov_b32_e32 v6, s15
; GFX7-NEXT: v_mov_b32_e32 v7, s14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0


@@ -170,24 +170,24 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: s_movk_i32 s4, 0x7fff
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
; GFX6-NEXT: s_movk_i32 s5, 0x8000
; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
; GFX6-NEXT: v_max_i32_e32 v3, s5, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v5
; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v1
; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v3i16:


@@ -170,25 +170,25 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: s_movk_i32 s4, 0x7fff
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
; GFX6-NEXT: s_movk_i32 s5, 0x8000
; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
; GFX6-NEXT: s_mov_b32 s6, 0xffff
; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_max_i32_e32 v3, s5, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v2, v5
; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
; GFX6-NEXT: v_and_b32_e32 v2, s6, v1
; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
; GFX6-NEXT: v_and_b32_e32 v2, s6, v3
; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v3i16:


@@ -128,19 +128,19 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_and_b32_e32 v4, s4, v4
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: v_and_b32_e32 v5, s4, v5
; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_and_b32_e32 v3, s4, v3
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_min_u32_e32 v1, s4, v1
; GFX6-NEXT: v_and_b32_e32 v5, s4, v5
; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_min_u32_e32 v3, s4, v2
; GFX6-NEXT: v_min_u32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v5
; GFX6-NEXT: v_min_u32_e32 v1, s4, v1
; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v1
; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v3i16:


@@ -131,17 +131,17 @@ define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_max_u32_e32 v1, v1, v8
; GFX6-NEXT: v_max_u32_e32 v0, v0, v7
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_and_b32_e32 v6, s4, v5
; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_max_u32_e32 v2, v2, v6
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v2, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_max_u32_e32 v1, v2, v6
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT: v_and_b32_e32 v2, s4, v1
; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v3i16:


@@ -1947,18 +1947,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; AVX512F-LABEL: splatvar_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpslld %xmm4, %zmm3, %zmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1967,18 +1967,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; AVX512VL-LABEL: splatvar_funnnel_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpslld %xmm4, %zmm3, %zmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1988,18 +1988,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512BW-LABEL: splatvar_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm3
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %ymm3, %ymm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
@@ -2009,18 +2009,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm3, %ymm3
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
@@ -2029,18 +2029,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm3
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm3, %ymm3
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
@@ -2049,18 +2049,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %ymm3, %ymm3
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0


@@ -1967,18 +1967,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm4
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrld %xmm5, %zmm3, %zmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm5, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrld %xmm4, %zmm3, %zmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpslld %xmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm2
; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1987,18 +1987,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm4
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrld %xmm5, %zmm3, %zmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm5, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrld %xmm4, %zmm3, %zmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpslld %xmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm2
; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -2008,17 +2008,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm2, %xmm5, %xmm2
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vptestnmb %zmm4, %zmm4, %k1
; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -2029,17 +2029,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm5, %xmm2
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm4, %k1
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -2049,17 +2049,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm5, %xmm2
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vptestnmb %xmm4, %xmm4, %k1
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
@@ -2068,17 +2068,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm5, %xmm2
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT: vptestnmb %xmm4, %xmm4, %k1
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq