[X86][SSE] Add support for combining target shuffles to SHUFPS.

As discussed on D27692, the next step will be to allow cross-domain shuffles once the combined shuffle depth passes a certain point.
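
For illustration, a scalar model of the SHUFPS selection being targeted (a sketch with a hypothetical helper name, not the lowering code itself): the low two result elements are taken from the first source and the high two from the second source, two immediate bits per element.

  #include <array>

  // Sketch (hypothetical helper): scalar model of SHUFPS element selection.
  static std::array<float, 4> shufpsModel(const std::array<float, 4> &A,
                                          const std::array<float, 4> &B,
                                          unsigned Imm) {
    return {A[(Imm >> 0) & 3], A[(Imm >> 2) & 3],
            B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
  }

So a combined two-input v4f32 mask such as <0,1,4,4> can now be matched directly; in the updated tests below this shows up as a single "shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]" replacing a pair of dependent insertps instructions.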

llvm-svn: 290064
Simon Pilgrim 2016-12-18 14:26:02 +00:00
parent ccfbf384ba
commit e940daf532
12 changed files with 201 additions and 115 deletions

View File

@ -7779,6 +7779,42 @@ is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, SM_SentinelUndef);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
if (Mask[i] == SM_SentinelUndef)
continue;
if (Mask[i] == SM_SentinelZero) {
if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
return false;
RepeatedMask[i % LaneSize] = SM_SentinelZero;
continue;
}
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
// Adjust second vector indices to start at LaneSize instead of Size.
int LocalM =
Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
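
A standalone sketch of the repeated-mask check added above (a simplified model over plain ints with a hypothetical helper name, not the LLVM helper itself; Undef/Zero stand in for SM_SentinelUndef/SM_SentinelZero):

  #include <vector>

  // Sketch (hypothetical helper): fold a wide two-input shuffle mask into a
  // per-lane mask of LaneSize entries, failing on any lane-crossing element
  // or on a per-slot mismatch between lanes.
  static bool repeatsPerLane(const std::vector<int> &Mask, int LaneSize,
                             std::vector<int> &Repeated) {
    const int Undef = -1, Zero = -2;
    int Size = (int)Mask.size();
    Repeated.assign(LaneSize, Undef);
    for (int i = 0; i < Size; ++i) {
      int M = Mask[i], Slot = i % LaneSize;
      if (M == Undef)
        continue;
      if (M == Zero) {               // a zero may only merge with undef/zero
        if (Repeated[Slot] >= 0)
          return false;
        Repeated[Slot] = Zero;
        continue;
      }
      if ((M % Size) / LaneSize != i / LaneSize)
        return false;                // element crosses a lane boundary
      // Rebase second-operand indices (>= Size) to start at LaneSize.
      int Local = (M < Size) ? M % LaneSize : M % LaneSize + LaneSize;
      if (Repeated[Slot] == Undef)
        Repeated[Slot] = Local;
      else if (Repeated[Slot] != Local)
        return false;
    }
    return true;
  }

For example, a v8f32 mask {0, 9, Zero, Undef, 4, 13, Zero, Undef} with LaneSize = 4 folds to the repeated mask {0, 5, Zero, Undef}, which is the form the new SHUFPS matching below operates on.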
@ -26274,6 +26310,50 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
// Attempt to combine to SHUFPS.
if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
SmallVector<int, 4> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
int M0 = RepeatedMask[Offset];
int M1 = RepeatedMask[Offset + 1];
if (isUndefInRange(RepeatedMask, Offset, 2)) {
return DAG.getUNDEF(MaskVT);
} else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
S0 = (SM_SentinelUndef == M0 ? -1 : 0);
S1 = (SM_SentinelUndef == M1 ? -1 : 1);
return getZeroVector(MaskVT, Subtarget, DAG, DL);
} else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
return V1;
} else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
return V2;
}
return SDValue();
};
int ShufMask[4] = {-1, -1, -1, -1};
SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
if (Lo && Hi) {
V1 = Lo;
V2 = Hi;
Shuffle = X86ISD::SHUFP;
ShuffleVT = MaskVT;
PermuteImm = getV4X86ShuffleImm(ShufMask);
return true;
}
}
}
return false;
}
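
A minimal sketch of how the collected ShufMask entries become the SHUFPS immediate, assuming the usual two-bits-per-element packing done by getV4X86ShuffleImm, with undef entries contributing zero (hypothetical helper name, for illustration only):

  // Sketch (hypothetical helper): pack four 2-bit selectors into a
  // SHUFPS/PSHUFD-style immediate.
  static unsigned packV4ShuffleImm(const int ShufMask[4]) {
    unsigned Imm = 0;
    for (int i = 0; i != 4; ++i)
      Imm |= (unsigned)(ShufMask[i] < 0 ? 0 : ShufMask[i] & 3) << (2 * i);
    return Imm;
  }

With the low half matched to V1 and the high half to V2, ShufMask = {0, 1, 0, 0} packs to 0x04, i.e. the "xmm0 = xmm0[0,1],xmm1[0,0]" patterns seen in the updated tests.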
@ -27294,7 +27374,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
switch (N.getOpcode()) {
unsigned Opcode = N.getOpcode();
switch (Opcode) {
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
@ -27369,6 +27450,31 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
}
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
bool isFloat = VT.isFloatingPoint();
SDValue V0 = peekThroughBitcasts(N->getOperand(0));
SDValue V1 = peekThroughBitcasts(N->getOperand(1));
bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
// We often lower to MOVSD/MOVSS from integer as well as native float
// types; remove unnecessary domain-crossing bitcasts if we can to make it
// easier to combine shuffles later on. We've already accounted for the
// domain switching cost when we decided to lower with it.
if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
V0 = DAG.getBitcast(NewVT, V0);
V1 = DAG.getBitcast(NewVT, V1);
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
}
return SDValue();
}
case X86ISD::INSERTPS: {
assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
SDValue Op0 = N.getOperand(0);

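Schematically, the MOVSD/MOVSS case above peels the operands' domain-crossing bitcasts and rebuilds the node in the operands' domain, e.g. (v4f32 (MOVSS (bitcast v4i32 X), (bitcast v4i32 Y))) becomes (bitcast v4f32 (v4i32 (MOVSS X, Y))), so later integer-domain shuffle combines can see through it; the zero-operand checks simply treat an all-zeros build vector as belonging to either domain.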
View File

@ -84,8 +84,7 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; KNL-NEXT: retq
;
; SKX-LABEL: sltof2f32:

View File

@ -89,7 +89,7 @@ define i64 @t4(<2 x double>* %a) {
; X32-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X32-SSE2-NEXT: movd %xmm1, %eax
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-SSE2-NEXT: movd %xmm0, %edx
; X32-SSE2-NEXT: retl
;

View File

@ -9,7 +9,7 @@
define double @test1(double %A) {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: retq

View File

@ -1104,39 +1104,26 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n
define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_X0YY:
; SSE: # BB#0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_X0YY:
; AVX: # BB#0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4f32_f32_X0YY:
; X32-SSE1: # BB#0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_X0YY:
; X32-SSE41: # BB#0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE41-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; X32-SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE41-NEXT: retl
; X32-SSE-LABEL: merge_4f32_f32_X0YY:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X32-SSE-NEXT: retl
%val0 = load float, float* %ptr0, align 4
%val1 = load float, float* %ptr1, align 4
%res0 = insertelement <4 x float> undef, float %val0, i32 0

View File

@ -1100,48 +1100,46 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # BB#0:
; SSE2-NEXT: movdqu 80(%rdi), %xmm8
; SSE2-NEXT: movups 64(%rdi), %xmm10
; SSE2-NEXT: movups 80(%rdi), %xmm5
; SSE2-NEXT: movups 64(%rdi), %xmm8
; SSE2-NEXT: movups (%rdi), %xmm0
; SSE2-NEXT: movups 16(%rdi), %xmm7
; SSE2-NEXT: movdqu 32(%rdi), %xmm9
; SSE2-NEXT: movups 48(%rdi), %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm3[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,0]
; SSE2-NEXT: movaps %xmm0, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm7[2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,2,2]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,2]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,3]
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3]
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1]
; SSE2-NEXT: movups 16(%rdi), %xmm6
; SSE2-NEXT: movups 32(%rdi), %xmm2
; SSE2-NEXT: movups 48(%rdi), %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[2,3]
; SSE2-NEXT: movaps %xmm5, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
; SSE2-NEXT: movaps %xmm0, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[2,3]
; SSE2-NEXT: movaps %xmm2, %xmm7
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,1,0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
; SSE2-NEXT: movups %xmm3, 16(%rsi)
; SSE2-NEXT: movups %xmm5, (%rsi)
; SSE2-NEXT: movups %xmm2, 16(%rdx)
; SSE2-NEXT: movups %xmm4, (%rsi)
; SSE2-NEXT: movups %xmm1, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
; SSE2-NEXT: movupd %xmm1, 16(%rcx)
; SSE2-NEXT: movupd %xmm4, (%rcx)
; SSE2-NEXT: movupd %xmm7, 16(%rcx)
; SSE2-NEXT: movupd %xmm9, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:

View File

@ -8,7 +8,7 @@ define i64 @PR30511(<2 x double> %a) {
; CHECK-LABEL: PR30511:
; CHECK: # BB#0:
; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: movd %xmm0, %rax

View File

@ -16,8 +16,8 @@ define i32 @test0(<1 x i64>* %v4) nounwind {
; X32-NEXT: movl %ecx, (%esp)
; X32-NEXT: pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3]
; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: addl $32, %eax
; X32-NEXT: movl %ebp, %esp
@ -55,8 +55,8 @@ define i32 @test1(i32* nocapture readonly %ptr) nounwind {
; X32-NEXT: movd (%eax), %mm0
; X32-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3]
; X32-NEXT: movq %mm0, (%esp)
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: emms
; X32-NEXT: movl %ebp, %esp
@ -98,8 +98,8 @@ define i32 @test2(i32* nocapture readonly %ptr) nounwind {
; X32-NEXT: movl 8(%ebp), %eax
; X32-NEXT: pshufw $232, (%eax), %mm0 # mm0 = mem[0,2,2,3]
; X32-NEXT: movq %mm0, (%esp)
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: emms
; X32-NEXT: movl %ebp, %esp
@ -149,8 +149,8 @@ define i32 @test4(x86_mmx %a) nounwind {
; X32-NEXT: andl $-8, %esp
; X32-NEXT: subl $8, %esp
; X32-NEXT: movq %mm0, (%esp)
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,0,1]
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp

View File

@ -1059,8 +1059,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_2i64_to_4f32:
@ -1071,8 +1070,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_2i64_to_4f32:
@ -1083,8 +1081,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_2i64_to_4f32:
@ -1185,8 +1182,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_4i64_to_4f32_undef:
@ -1197,8 +1193,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef:
@ -1209,8 +1204,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef:
@ -1673,8 +1667,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; VEX-NEXT: # BB#7:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: .LBB39_8:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i64_to_4f32:
@ -1685,8 +1678,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i64_to_4f32:
@ -1697,8 +1689,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i64_to_4f32:
@ -1911,8 +1902,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; VEX-NEXT: # BB#7:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: .LBB41_8:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i64_to_4f32_undef:
@ -1923,8 +1913,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef:
@ -1935,8 +1924,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef:

View File

@ -1847,16 +1847,20 @@ define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4f32_bitcast_4401:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_bitcast_4401:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v4f32_bitcast_4401:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f32_bitcast_4401:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
%1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
%2 = bitcast <4 x i32> %1 to <2 x double>
%3 = bitcast <4 x float> %a to <2 x double>

View File

@ -75,12 +75,16 @@ define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1
define <4 x float> @combine_vpermil2ps_1z74(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: combine_vpermil2ps_1z74:
; X32: # BB#0:
; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[1],zero,xmm1[3,0]
; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2ps_1z74:
; X64: # BB#0:
; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[1],zero,xmm1[3,0]
; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X64-NEXT: retq
%res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 1, i32 1, i32 7, i32 4>, i8 0)
%res1 = shufflevector <4 x float> %res0, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 7, i32 2, i32 3>

View File

@ -8,9 +8,9 @@ define void @test0(<1 x i64>* %x) {
; X32-LABEL: test0:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-NEXT: movq %xmm0, (%eax)
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-NEXT: movlps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test0: