[X86] LowerRotate - recognise hidden ROTR patterns for better vXi8 codegen
Check for a hidden ISD::ROTR (rotl(x, sub(0, y))) - vXi8 lowering can handle both directions (it's always beneficial for splat amounts, but otherwise only if we have VPTERNLOG). We currently hit infinite loops in TargetLowering::expandROT if we set ISD::ROTR to Custom, which needs addressing before we extend this much further.
commit 0f652d8f52 (parent 47eb3f155f)
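The rewrite rests on the rotate identity rotl(x, 0 - y) == rotr(x, y) for 8-bit lanes, since only the low three bits of the amount matter. A minimal scalar sketch of that equivalence (illustration only, not code from the patch):

#include <cassert>
#include <cstdint>

// rotl/rotr on one 8-bit lane; the vector lowering applies the same identity
// to every byte element of the amount vector.
static uint8_t rotl8(uint8_t X, uint8_t Amt) {
  Amt &= 7;
  return (uint8_t)((X << Amt) | (X >> ((8 - Amt) & 7)));
}
static uint8_t rotr8(uint8_t X, uint8_t Amt) {
  Amt &= 7;
  return (uint8_t)((X >> Amt) | (X << ((8 - Amt) & 7)));
}

int main() {
  // rotl(x, sub(0, y)) behaves exactly like rotr(x, y): negating the amount
  // modulo 8 flips the rotate direction.
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y)
      assert(rotl8((uint8_t)X, (uint8_t)(0u - Y)) == rotr8((uint8_t)X, (uint8_t)Y));
  return 0;
}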
@@ -29854,20 +29854,30 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
     if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
       return SDValue();
 
+    // Check for a hidden ISD::ROTR, vXi8 lowering can handle both, but we
+    // currently hit infinite loops in legalization if we allow ISD::ROTR.
+    // FIXME: Infinite ROTL<->ROTR legalization in TargetLowering::expandROT.
+    SDValue HiddenROTRAmt;
+    if (Amt.getOpcode() == ISD::SUB &&
+        ISD::isBuildVectorAllZeros(Amt.getOperand(0).getNode()))
+      HiddenROTRAmt = Amt.getOperand(1);
+
     MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
 
     // If the amount is a splat, attempt to fold as unpack(x,x) << zext(y):
     // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
-    if (SDValue BaseRotAmt =
-            DAG.getSplatValue(DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask))) {
+    // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
+    if (SDValue BaseRotAmt = DAG.getSplatValue(DAG.getNode(
+            ISD::AND, DL, VT, HiddenROTRAmt ? HiddenROTRAmt : Amt, AmtMask))) {
+      unsigned ShiftX86Opc = HiddenROTRAmt ? X86ISD::VSRLI : X86ISD::VSHLI;
       BaseRotAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseRotAmt);
       SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
       SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
-      Lo = getTargetVShiftNode(X86ISD::VSHLI, DL, ExtVT, Lo, BaseRotAmt,
+      Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
                                Subtarget, DAG);
-      Hi = getTargetVShiftNode(X86ISD::VSHLI, DL, ExtVT, Hi, BaseRotAmt,
+      Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
                                Subtarget, DAG);
-      return getPack(DAG, Subtarget, DL, VT, Lo, Hi, /*PackHiHalf */ true);
+      return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !HiddenROTRAmt);
     }
 
     // We don't need ModuloAmt here as we just peek at individual bits.
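The splat path widens each byte to a 16-bit lane as unpack(x,x), performs a single 16-bit shift by the splat amount, and packs one half of each lane back to bytes; keeping the low rather than the high half is what the new !HiddenROTRAmt argument to getPack selects. A scalar model of the two directions (a sketch; the helper name is illustrative, not from the patch):

#include <cstdint>

// Model of the unpack(x,x) + single vXi16 shift + pack trick for one byte.
// rot8_via_i16 is a hypothetical helper used only for this illustration.
static uint8_t rot8_via_i16(uint8_t X, uint8_t Amt, bool IsRotR) {
  uint16_t W = (uint16_t)((X << 8) | X); // unpack(x,x): the byte in both halves
  Amt &= 7;                              // AmtMask: only the low 3 bits matter
  if (IsRotR)
    return (uint8_t)(W >> Amt);          // rotr: shift right, keep the low byte
  return (uint8_t)((W << Amt) >> 8);     // rotl: shift left, keep the high byte
}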
@@ -29889,6 +29899,15 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
       return DAG.getSelect(DL, SelVT, C, V0, V1);
     };
 
+    // 'Hidden' ROTR is currently only profitable on AVX512 targets where we
+    // have VPTERNLOG.
+    unsigned ShiftLHS = ISD::SHL;
+    unsigned ShiftRHS = ISD::SRL;
+    if (HiddenROTRAmt && useVPTERNLOG(Subtarget, VT)) {
+      std::swap(ShiftLHS, ShiftRHS);
+      Amt = HiddenROTRAmt;
+    }
+
     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
     // We can safely do this using i16 shifts as we're only interested in
     // the 3 lower bits of each byte.
@@ -29900,8 +29919,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
     SDValue M;
     M = DAG.getNode(
         ISD::OR, DL, VT,
-        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
-        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
+        DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
+        DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
     R = SignBitSelect(VT, Amt, M, R);
 
     // a += a
@@ -29910,8 +29929,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
     // r = VSELECT(r, rot(r, 2), a);
     M = DAG.getNode(
         ISD::OR, DL, VT,
-        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
-        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
+        DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
+        DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
     R = SignBitSelect(VT, Amt, M, R);
 
     // a += a
@@ -29920,8 +29939,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
     // return VSELECT(r, rot(r, 1), a);
     M = DAG.getNode(
         ISD::OR, DL, VT,
-        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
-        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
+        DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
+        DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
     return SignBitSelect(VT, Amt, M, R);
   }
 
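Taken together, the three stages above rotate by 4, 2 and 1 and keep each partial result only when the corresponding amount bit is set; swapping ShiftLHS/ShiftRHS turns the same ladder into a right rotate driven by the un-negated amount. A scalar sketch of the ladder (hypothetical helper, not part of the patch):

#include <cstdint>

// Scalar sketch of the rot4/rot2/rot1 ladder; IsRotR models the ShiftLHS /
// ShiftRHS swap performed when a hidden ROTR amount is used directly.
static uint8_t rot_by_bits(uint8_t X, uint8_t Amt, bool IsRotR) {
  const unsigned Steps[] = {4, 2, 1};
  for (unsigned Step : Steps) {
    // One partial rotate in the chosen direction.
    uint8_t Rot = IsRotR ? (uint8_t)((X >> Step) | (X << (8 - Step)))
                         : (uint8_t)((X << Step) | (X >> (8 - Step)));
    // The vector code selects per byte on the amount's sign bit after
    // shifting the amount left by 5; here we simply test the matching bit.
    if (Amt & Step)
      X = Rot;
  }
  return X;
}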
@@ -1195,47 +1195,44 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v16i8:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: psubb %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: psllw %xmm2, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm2
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: splatvar_funnnel_v16i8:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: psubb %xmm1, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE41-NEXT: psllw %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm2
 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: psllw %xmm2, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: packuswb %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: splatvar_funnnel_v16i8:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpsubb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsllw %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0
 ; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
@@ -1349,19 +1346,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
 ; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pxor %xmm2, %xmm2
-; X86-SSE2-NEXT: psubb %xmm1, %xmm2
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; X86-SSE2-NEXT: psllw %xmm2, %xmm1
-; X86-SSE2-NEXT: psrlw $8, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-SSE2-NEXT: psrlw %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: psllw %xmm2, %xmm0
-; X86-SSE2-NEXT: psrlw $8, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
 ; X86-SSE2-NEXT: retl
 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat)
@@ -490,43 +490,38 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ;
 ; AVX512F-LABEL: var_funnnel_v32i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
+; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v32i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
+; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
@@ -975,70 +970,65 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind
 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v32i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
 ;