[X86] LowerRotate - recognise hidden ROTR patterns for better vXi8 codegen

Check for a hidden ISD::ROTR (rotl(sub(0,x))) - vXi8 lowering can handle both (its always beneficial for splats, but otherwise only if we have VPTERNLOG).

We currently hit infinite loops in TargetLowering::expandROT if we set ISD::ROTR to custom, which needs addressing before we extend this much further.
This commit is contained in:
Simon Pilgrim 2021-11-19 11:48:49 +00:00
parent 47eb3f155f
commit 0f652d8f52
3 changed files with 108 additions and 103 deletions

View File

@ -29854,20 +29854,30 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
return SDValue();
// Check for a hidden ISD::ROTR, vXi8 lowering can handle both, but we
// currently hit infinite loops in legalization if we allow ISD::ROTR.
// FIXME: Infinite ROTL<->ROTR legalization in TargetLowering::expandROT.
SDValue HiddenROTRAmt;
if (Amt.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(Amt.getOperand(0).getNode()))
HiddenROTRAmt = Amt.getOperand(1);
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// If the amount is a splat, attempt to fold as unpack(x,x) << zext(y):
// rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
if (SDValue BaseRotAmt =
DAG.getSplatValue(DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask))) {
// rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
if (SDValue BaseRotAmt = DAG.getSplatValue(DAG.getNode(
ISD::AND, DL, VT, HiddenROTRAmt ? HiddenROTRAmt : Amt, AmtMask))) {
unsigned ShiftX86Opc = HiddenROTRAmt ? X86ISD::VSRLI : X86ISD::VSHLI;
BaseRotAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseRotAmt);
SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
Lo = getTargetVShiftNode(X86ISD::VSHLI, DL, ExtVT, Lo, BaseRotAmt,
Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
Subtarget, DAG);
Hi = getTargetVShiftNode(X86ISD::VSHLI, DL, ExtVT, Hi, BaseRotAmt,
Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
Subtarget, DAG);
return getPack(DAG, Subtarget, DL, VT, Lo, Hi, /*PackHiHalf */ true);
return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !HiddenROTRAmt);
}
// We don't need ModuloAmt here as we just peek at individual bits.
@ -29889,6 +29899,15 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getSelect(DL, SelVT, C, V0, V1);
};
// 'Hidden' ROTR is currently only profitable on AVX512 targets where we
// have VPTERNLOG.
unsigned ShiftLHS = ISD::SHL;
unsigned ShiftRHS = ISD::SRL;
if (HiddenROTRAmt && useVPTERNLOG(Subtarget, VT)) {
std::swap(ShiftLHS, ShiftRHS);
Amt = HiddenROTRAmt;
}
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
@ -29900,8 +29919,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SDValue M;
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
@ -29910,8 +29929,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// r = VSELECT(r, rot(r, 2), a);
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
@ -29920,8 +29939,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// return VSELECT(r, rot(r, 1), a);
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
return SignBitSelect(VT, Amt, M, R);
}

View File

@ -1195,47 +1195,44 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm2
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psllw %xmm2, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: psrlw %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psllw %xmm2, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: psubb %xmm1, %xmm2
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE41-NEXT: psllw %xmm2, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE41-NEXT: psrlw %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: psllw %xmm2, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: psrlw %xmm1, %xmm0
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_funnnel_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX-NEXT: vpsllw %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
;
@ -1349,19 +1346,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
;
; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pxor %xmm2, %xmm2
; X86-SSE2-NEXT: psubb %xmm1, %xmm2
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; X86-SSE2-NEXT: psllw %xmm2, %xmm1
; X86-SSE2-NEXT: psrlw $8, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE2-NEXT: psrlw %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: psllw %xmm2, %xmm0
; X86-SSE2-NEXT: psrlw $8, %xmm0
; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
; X86-SSE2-NEXT: psrlw %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm3, %xmm0
; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
%res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat)

View File

@ -490,43 +490,38 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
;
; AVX512F-LABEL: var_funnnel_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@ -975,70 +970,65 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;