[X86][SSE] Add initial support for combining target shuffles to (V)PMOVZX.

We can only handle 128-bit vectors until we support target shuffle inputs of a different size to the output.

llvm-svn: 288140

This commit is contained in: parent da22b3cb8a, commit 35c47c494d
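The idea behind the combine: a shuffle whose mask moves source element i into lane i*Scale and fills the remaining lanes of each group with zeros is exactly a zero-extension, so it can be emitted as a single (V)PMOVZX. The snippet below is a minimal standalone sketch of that matching rule, not the LLVM implementation itself; the -1/-2 sentinels mirror LLVM's SM_SentinelUndef/SM_SentinelZero convention and the helper name is invented for illustration.

// Standalone sketch (not LLVM code) of the rule the patch adds: a shuffle
// mask matches a zero-extend at scale S if lane i*S selects source element i
// and the remaining S-1 lanes of each group are undef or zero.
// Sentinels follow LLVM's convention: -1 = undef, -2 = zero.
#include <cstdio>
#include <vector>

static bool matchesZeroExtend(const std::vector<int> &Mask, unsigned Scale) {
  unsigned NumDstElts = Mask.size() / Scale;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    int M = Mask[i * Scale];
    if (M != -1 && M != (int)i)              // must be undef or element i
      return false;
    for (unsigned j = 1; j != Scale; ++j) {  // trailing lanes: undef or zero
      int Z = Mask[i * Scale + j];
      if (Z != -1 && Z != -2)
        return false;
    }
  }
  return true;
}

int main() {
  // v16i8 mask that keeps bytes 0..3 and zeroes the upper three bytes of each
  // dword group - the byte pattern of PMOVZXBD (zero-extend i8 to i32).
  std::vector<int> Mask = {0, -2, -2, -2, 1, -2, -2, -2,
                           2, -2, -2, -2, 3, -2, -2, -2};
  std::printf("matches at scale 4: %d\n", matchesZeroExtend(Mask, 4)); // 1
}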
@@ -25445,7 +25445,7 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                     const X86Subtarget &Subtarget,
-                                    unsigned &Shuffle, MVT &ShuffleVT) {
+                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
   unsigned NumMaskElts = Mask.size();
   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
   bool FloatDomain = MaskVT.isFloatingPoint() ||

@@ -25456,27 +25456,48 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
       isUndefOrEqual(Mask[0], 0) &&
       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
     Shuffle = X86ISD::VZEXT_MOVL;
-    ShuffleVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
     return true;
   }

+  // Match against a VZEXT instruction.
+  // TODO: Add 256/512-bit vector support.
+  if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+    unsigned MaxScale = 64 / MaskEltSize;
+    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
+      bool Match = true;
+      unsigned NumDstElts = NumMaskElts / Scale;
+      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
+        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
+        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
+      }
+      if (Match) {
+        SrcVT = MaskVT;
+        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
+        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
+        Shuffle = X86ISD::VZEXT;
+        return true;
+      }
+    }
+  }
+
   // Check if we have SSE3 which will let us use MOVDDUP etc. The
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
     if (isTargetShuffleEquivalent(Mask, {0, 0})) {
       Shuffle = X86ISD::MOVDDUP;
-      ShuffleVT = MVT::v2f64;
+      SrcVT = DstVT = MVT::v2f64;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
       Shuffle = X86ISD::MOVSLDUP;
-      ShuffleVT = MVT::v4f32;
+      SrcVT = DstVT = MVT::v4f32;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
       Shuffle = X86ISD::MOVSHDUP;
-      ShuffleVT = MVT::v4f32;
+      SrcVT = DstVT = MVT::v4f32;
       return true;
     }
   }
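To make the new SrcVT/DstVT outputs concrete, here is a hypothetical walk-through of the scale loop for a v8i16 mask of the form {0, Z, 1, Z, 2, Z, 3, Z} (Z = zero lane), written with plain integers instead of MVT so it is only illustrative:

// Hypothetical walk-through for MaskVT = v8i16 (NumMaskElts = 8,
// MaskEltSize = 16), using plain integers instead of MVT.
#include <cstdio>

int main() {
  unsigned NumMaskElts = 8, MaskEltSize = 16;
  unsigned MaxScale = 64 / MaskEltSize;   // 4: the widest zext is i16 -> i64
  for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
    unsigned NumDstElts = NumMaskElts / Scale;
    unsigned DstEltSize = Scale * MaskEltSize;
    std::printf("Scale %u would give v%ui%u\n", Scale, NumDstElts, DstEltSize);
  }
  // The mask {0, Z, 1, Z, 2, Z, 3, Z} matches at Scale = 2, so the combine
  // reports SrcVT = v8i16 and DstVT = v4i32, which lowers to PMOVZXWD.
}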
@@ -25485,17 +25506,17 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
       Shuffle = X86ISD::MOVDDUP;
-      ShuffleVT = MVT::v4f64;
+      SrcVT = DstVT = MVT::v4f64;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
       Shuffle = X86ISD::MOVSLDUP;
-      ShuffleVT = MVT::v8f32;
+      SrcVT = DstVT = MVT::v8f32;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
       Shuffle = X86ISD::MOVSHDUP;
-      ShuffleVT = MVT::v8f32;
+      SrcVT = DstVT = MVT::v8f32;
       return true;
     }
   }

@@ -25505,19 +25526,19 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
            "AVX512 required for 512-bit vector shuffles");
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
       Shuffle = X86ISD::MOVDDUP;
-      ShuffleVT = MVT::v8f64;
+      SrcVT = DstVT = MVT::v8f64;
       return true;
     }
     if (isTargetShuffleEquivalent(
             Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
       Shuffle = X86ISD::MOVSLDUP;
-      ShuffleVT = MVT::v16f32;
+      SrcVT = DstVT = MVT::v16f32;
       return true;
     }
     if (isTargetShuffleEquivalent(
             Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
       Shuffle = X86ISD::MOVSHDUP;
-      ShuffleVT = MVT::v16f32;
+      SrcVT = DstVT = MVT::v16f32;
       return true;
     }
   }

@@ -25526,7 +25547,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   if (Subtarget.hasAVX2()) {
     SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
     if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
-      ShuffleVT = MaskVT;
+      SrcVT = DstVT = MaskVT;
       Shuffle = X86ISD::VBROADCAST;
       return true;
     }

@@ -25954,7 +25975,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

   // Attempt to match the mask against known shuffle patterns.
-  MVT ShuffleVT;
+  MVT ShuffleSrcVT, ShuffleVT;
   unsigned Shuffle, PermuteImm;

   if (UnaryShuffle) {

@@ -25973,12 +25994,13 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     }
   }

-  if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleVT)) {
+  if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleSrcVT,
+                              ShuffleVT)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
       return false; // AVX512 Writemask clash.
-    Res = DAG.getBitcast(ShuffleVT, V1);
+    Res = DAG.getBitcast(ShuffleSrcVT, V1);
     DCI.AddToWorklist(Res.getNode());
     Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
     DCI.AddToWorklist(Res.getNode());
@@ -175,24 +175,22 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; SSE41-LABEL: testv4i32:
 ; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pshufb %xmm3, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pshufb %xmm2, %xmm4
 ; SSE41-NEXT: psrlw $4, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pshufb %xmm0, %xmm1
-; SSE41-NEXT: paddb %xmm4, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE41-NEXT: psadbw %xmm0, %xmm2
-; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE41-NEXT: psadbw %xmm0, %xmm1
-; SSE41-NEXT: packuswb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: pshufb %xmm0, %xmm3
+; SSE41-NEXT: paddb %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE41-NEXT: psadbw %xmm1, %xmm3
+; SSE41-NEXT: psadbw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm3, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: testv4i32:

@@ -208,7 +206,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: retq
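The reason the checked asm can change from an interleave with a zeroed register to a pmovzxdq is that, for a zero second operand, punpckldq and pmovzxdq write the same 128 bits. A small standalone C++ sketch of that equivalence (illustrative values, little-endian lane ordering as on x86):

// Standalone illustration (not from the test file): with a zeroed second
// operand, punpckldq and pmovzxdq produce identical results, which is why
// the shuffle combine may replace one with the other.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t x[4] = {0x11111111, 0x22222222, 0x33333333, 0x44444444};

  // punpckldq x, zero: interleave the low two dwords of x with zero dwords.
  uint32_t unpck[4] = {x[0], 0, x[1], 0};

  // pmovzxdq x: zero-extend the low two dwords of x to qwords.
  uint64_t zext[2] = {x[0], x[1]};

  bool same = (uint64_t(unpck[1]) << 32 | unpck[0]) == zext[0] &&
              (uint64_t(unpck[3]) << 32 | unpck[2]) == zext[1];
  std::printf("equivalent: %d\n", same);  // prints 1
}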
@@ -58,7 +58,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5

@@ -69,7 +69,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -106,14 +106,19 @@ define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
 }

 define <4 x i32> @combine_pshufb_as_zext(<16 x i8> %a0) {
-; SSE-LABEL: combine_pshufb_as_zext:
-; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE-NEXT: retq
+; SSSE3-LABEL: combine_pshufb_as_zext:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_pshufb_as_zext:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: combine_pshufb_as_zext:
 ; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX-NEXT: retq
 %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 3, i8 -1, i8 -1, i8 -1>)
 %2 = bitcast <16 x i8> %1 to <4 x i32>
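Why this particular pshufb is a zero-extend: pshufb writes a zero byte wherever the mask byte has its high bit set (the -1 entries above) and otherwise copies the indexed source byte, so the mask <0,-1,-1,-1,1,...> reproduces exactly the byte pattern of pmovzxbd. A small standalone sketch of that equivalence (values are illustrative only):

// Standalone sketch: byte-wise pshufb semantics versus pmovzxbd. A mask byte
// with its high bit set writes zero; otherwise the low four bits index the
// source, so <0,-1,-1,-1,1,...> is a zero-extend of the low four bytes.
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t src[16], shuffled[16];
  const int8_t mask[16] = {0, -1, -1, -1, 1, -1, -1, -1,
                           2, -1, -1, -1, 3, -1, -1, -1};
  for (int i = 0; i < 16; ++i)
    src[i] = uint8_t(0xA0 + i);

  // pshufb: zero if the mask byte's top bit is set, else copy the indexed byte.
  for (int i = 0; i < 16; ++i)
    shuffled[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];

  // pmovzxbd: zero-extend the low four bytes to dwords.
  uint32_t zext[4] = {src[0], src[1], src[2], src[3]};

  bool same = true;
  for (int i = 0; i < 4; ++i) {
    uint32_t dword = shuffled[4 * i] | uint32_t(shuffled[4 * i + 1]) << 8 |
                     uint32_t(shuffled[4 * i + 2]) << 16 |
                     uint32_t(shuffled[4 * i + 3]) << 24;
    same &= (dword == zext[i]);
  }
  std::printf("equivalent: %d\n", same);  // prints 1
}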
@@ -448,22 +448,21 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE41-NEXT: psubd %xmm0, %xmm2
 ; SSE41-NEXT: pand %xmm0, %xmm2
 ; SSE41-NEXT: psubd {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pshufb %xmm4, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pand %xmm0, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pshufb %xmm3, %xmm5
 ; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: pand %xmm3, %xmm2
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: paddb %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE41-NEXT: psadbw %xmm1, %xmm2
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE41-NEXT: pand %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm2, %xmm4
+; SSE41-NEXT: paddb %xmm5, %xmm4
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE41-NEXT: psadbw %xmm1, %xmm4
 ; SSE41-NEXT: psadbw %xmm1, %xmm0
-; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm4, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: testv4i32:

@@ -482,7 +481,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: retq

@@ -504,7 +503,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: retq

@@ -547,7 +546,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX512CD-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512CD-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
-; AVX512CD-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512CD-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX512CD-NEXT: retq

@@ -559,22 +558,21 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; X32-SSE-NEXT: psubd %xmm0, %xmm2
 ; X32-SSE-NEXT: pand %xmm0, %xmm2
 ; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm4
-; X32-SSE-NEXT: pand %xmm3, %xmm4
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: pshufb %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pand %xmm0, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pshufb %xmm3, %xmm5
 ; X32-SSE-NEXT: psrlw $4, %xmm2
-; X32-SSE-NEXT: pand %xmm3, %xmm2
-; X32-SSE-NEXT: pshufb %xmm2, %xmm0
-; X32-SSE-NEXT: paddb %xmm5, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X32-SSE-NEXT: psadbw %xmm1, %xmm2
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: pand %xmm0, %xmm2
+; X32-SSE-NEXT: pshufb %xmm2, %xmm4
+; X32-SSE-NEXT: paddb %xmm5, %xmm4
+; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; X32-SSE-NEXT: psadbw %xmm1, %xmm4
 ; X32-SSE-NEXT: psadbw %xmm1, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
+; X32-SSE-NEXT: packuswb %xmm4, %xmm0
 ; X32-SSE-NEXT: retl
 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
 ret <4 x i32> %out

@@ -671,22 +669,21 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; SSE41-NEXT: psubd %xmm0, %xmm2
 ; SSE41-NEXT: pand %xmm0, %xmm2
 ; SSE41-NEXT: psubd {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pshufb %xmm4, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pand %xmm0, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pshufb %xmm3, %xmm5
 ; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: pand %xmm3, %xmm2
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: paddb %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE41-NEXT: psadbw %xmm1, %xmm2
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE41-NEXT: pand %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm2, %xmm4
+; SSE41-NEXT: paddb %xmm5, %xmm4
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE41-NEXT: psadbw %xmm1, %xmm4
 ; SSE41-NEXT: psadbw %xmm1, %xmm0
-; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm4, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: testv4i32u:

@@ -705,7 +702,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: retq

@@ -727,7 +724,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: retq

@@ -759,22 +756,21 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; X32-SSE-NEXT: psubd %xmm0, %xmm2
 ; X32-SSE-NEXT: pand %xmm0, %xmm2
 ; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm4
-; X32-SSE-NEXT: pand %xmm3, %xmm4
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: pshufb %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pand %xmm0, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pshufb %xmm3, %xmm5
 ; X32-SSE-NEXT: psrlw $4, %xmm2
-; X32-SSE-NEXT: pand %xmm3, %xmm2
-; X32-SSE-NEXT: pshufb %xmm2, %xmm0
-; X32-SSE-NEXT: paddb %xmm5, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X32-SSE-NEXT: psadbw %xmm1, %xmm2
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: pand %xmm0, %xmm2
+; X32-SSE-NEXT: pshufb %xmm2, %xmm4
+; X32-SSE-NEXT: paddb %xmm5, %xmm4
+; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; X32-SSE-NEXT: psadbw %xmm1, %xmm4
 ; X32-SSE-NEXT: psadbw %xmm1, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
+; X32-SSE-NEXT: packuswb %xmm4, %xmm0
 ; X32-SSE-NEXT: retl
 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
 ret <4 x i32> %out
@@ -224,7 +224,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0

@@ -236,7 +236,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0

@@ -354,7 +354,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0

@@ -366,7 +366,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -1636,7 +1636,7 @@ define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone
 ;
 ; AVX1-LABEL: shuf_zext_4i32_to_4i64:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,0]

@@ -1892,18 +1892,30 @@ entry:
 }

 define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: retq
+; SSE2-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuf_zext_8i16_to_4i32_offset1:
 ; AVX: # BB#0: # %entry
 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX-NEXT: retq
 entry:
 %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8>