[X86][SSE] lowerShuffleAsBitRotate - lower vXi8 shuffles to ROTL on pre-SSSE3 targets

Without PSHUFB we are better off using ROTL (expanding to OR(SHL,SRL)) than using the generic v16i8 shuffle lowering - but if we can widen to v8i16 or more then the existing shuffles are still the better option.

REAPPLIED: Original commit rG11c16e71598d was reverted at rGde1d90299b16 as it wasn't accounting for later lowering. This version emits ROTLI or the OR(VSHLI/VSRLI) directly to avoid the issue.
This commit is contained in:
Simon Pilgrim 2020-02-14 11:54:55 +00:00
parent de1c2877a9
commit 2492075add
5 changed files with 157 additions and 201 deletions

View File

@ -11704,7 +11704,7 @@ static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
return RotateAmt;
}
/// Lower shuffle using ISD::ROTL rotations.
/// Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
@ -11716,25 +11716,46 @@ static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
// Only XOP + AVX512 targets have bit rotation instructions.
// If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
bool IsLegal =
(VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
if (!IsLegal)
if (!IsLegal && Subtarget.hasSSE3())
return SDValue();
// AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
int MinSubElts = Subtarget.hasXOP() ? 2 : std::max(32 / EltSizeInBits, 2);
int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
int MaxSubElts = 64 / EltSizeInBits;
for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
if (RotateAmt < 0)
continue;
int RotateAmtInBits = RotateAmt * EltSizeInBits;
int NumElts = VT.getVectorNumElements();
MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
MVT RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
// For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
// expanded to OR(SRL,SHL), will be more efficient, but if they can
// widen to vXi16 or more then the existing lowering will be better.
int RotateAmtInBits = RotateAmt * EltSizeInBits;
if (!IsLegal) {
if ((RotateAmtInBits % 16) == 0)
return SDValue();
// TODO: Use getTargetVShiftByConstNode.
unsigned ShlAmt = RotateAmtInBits;
unsigned SrlAmt = RotateSVT.getScalarSizeInBits() - RotateAmtInBits;
V1 = DAG.getBitcast(RotateVT, V1);
SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
return DAG.getBitcast(VT, Rot);
}
SDValue Rot =
DAG.getNode(ISD::ROTL, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
DAG.getConstant(RotateAmtInBits, DL, RotateVT));
DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
DAG.getTargetConstant(RotateAmtInBits, DL, MVT::i8));
return DAG.getBitcast(VT, Rot);
}

View File

@ -52,15 +52,10 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
;
; X64-LABEL: test_bitreverse_v2i16:
; X64: # %bb.0:
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X64-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; X64-NEXT: packuswb %xmm2, %xmm0
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrlw $8, %xmm1
; X64-NEXT: psllw $8, %xmm0
; X64-NEXT: por %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psllw $4, %xmm1
; X64-NEXT: pand {{.*}}(%rip), %xmm1

View File

@ -11,15 +11,10 @@ declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
define <8 x i16> @test1(<8 x i16> %v) {
; CHECK-NOSSSE3-LABEL: test1:
; CHECK-NOSSSE3: # %bb.0: # %entry
; CHECK-NOSSSE3-NEXT: pxor %xmm1, %xmm1
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm2
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; CHECK-NOSSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; CHECK-NOSSSE3-NEXT: packuswb %xmm2, %xmm0
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm1
; CHECK-NOSSSE3-NEXT: psrlw $8, %xmm1
; CHECK-NOSSSE3-NEXT: psllw $8, %xmm0
; CHECK-NOSSSE3-NEXT: por %xmm1, %xmm0
; CHECK-NOSSSE3-NEXT: ret{{[l|q]}}
;
; CHECK-SSSE3-LABEL: test1:
@ -132,23 +127,14 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
define <16 x i16> @test4(<16 x i16> %v) {
; CHECK-NOSSSE3-LABEL: test4:
; CHECK-NOSSSE3: # %bb.0: # %entry
; CHECK-NOSSSE3-NEXT: pxor %xmm2, %xmm2
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm3
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
; CHECK-NOSSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; CHECK-NOSSSE3-NEXT: packuswb %xmm3, %xmm0
; CHECK-NOSSSE3-NEXT: movdqa %xmm1, %xmm3
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
; CHECK-NOSSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
; CHECK-NOSSSE3-NEXT: packuswb %xmm3, %xmm1
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm2
; CHECK-NOSSSE3-NEXT: psrlw $8, %xmm2
; CHECK-NOSSSE3-NEXT: psllw $8, %xmm0
; CHECK-NOSSSE3-NEXT: por %xmm2, %xmm0
; CHECK-NOSSSE3-NEXT: movdqa %xmm1, %xmm2
; CHECK-NOSSSE3-NEXT: psrlw $8, %xmm2
; CHECK-NOSSSE3-NEXT: psllw $8, %xmm1
; CHECK-NOSSSE3-NEXT: por %xmm2, %xmm1
; CHECK-NOSSSE3-NEXT: ret{{[l|q]}}
;
; CHECK-SSSE3-LABEL: test4:
@ -252,15 +238,10 @@ declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
define <4 x i16> @test7(<4 x i16> %v) {
; CHECK-NOSSSE3-LABEL: test7:
; CHECK-NOSSSE3: # %bb.0: # %entry
; CHECK-NOSSSE3-NEXT: pxor %xmm1, %xmm1
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm2
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; CHECK-NOSSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; CHECK-NOSSSE3-NEXT: packuswb %xmm2, %xmm0
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm1
; CHECK-NOSSSE3-NEXT: psrlw $8, %xmm1
; CHECK-NOSSSE3-NEXT: psllw $8, %xmm0
; CHECK-NOSSSE3-NEXT: por %xmm1, %xmm0
; CHECK-NOSSSE3-NEXT: ret{{[l|q]}}
;
; CHECK-SSSE3-LABEL: test7:

View File

@ -295,15 +295,10 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
@ -647,63 +642,54 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: paddb %xmm7, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: psrlw $8, %xmm7
; SSE2-NEXT: psllw $8, %xmm2
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: psllw $4, %xmm7
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm7, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
@ -1387,118 +1373,101 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm11
; SSE2-NEXT: pxor %xmm10, %xmm10
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: pandn %xmm5, %xmm7
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: psllw $2, %xmm7
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm10, %xmm6
; SSE2-NEXT: psllw $2, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: paddb %xmm7, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psllw $4, %xmm6
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm6, %xmm4
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: psrlw $8, %xmm7
; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: por %xmm7, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: psllw $4, %xmm7
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pandn %xmm7, %xmm5
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pand %xmm10, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pandn %xmm4, %xmm6
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: psllw $8, %xmm2
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: pandn %xmm5, %xmm7
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pand %xmm10, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pand %xmm9, %xmm2
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: psrlw $4, %xmm6
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: psllw $8, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm10
; SSE2-NEXT: psllw $2, %xmm10
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm7
; SSE2-NEXT: paddb %xmm7, %xmm7
; SSE2-NEXT: por %xmm10, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: paddb %xmm6, %xmm6
; SSE2-NEXT: pand %xmm9, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i16:

View File

@ -458,15 +458,10 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(
define <16 x i8> @shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
@ -1891,15 +1886,10 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06(
define <16 x i8> @shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,5,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $24, %xmm1
; SSE2-NEXT: pslld $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14: