diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 351a22c46feb..e29c7cd6e3b4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7779,6 +7779,42 @@ is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
 }
 
+/// Test whether a target shuffle mask is equivalent within each sub-lane.
+/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
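+/// For example, the v8f32 mask <1,3,8,10,5,7,12,14> repeats within each
+/// 128-bit lane and yields the repeated mask <1,3,4,6> (indices into the
+/// second vector are rebased to start at LaneSize), while any element that
+/// crosses a 128-bit lane boundary makes the mask non-repeating.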
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
+                                        ArrayRef<int> Mask,
+                                        SmallVectorImpl<int> &RepeatedMask) {
+  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
+    if (Mask[i] == SM_SentinelUndef)
+      continue;
+    if (Mask[i] == SM_SentinelZero) {
+      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
+        return false;
+      RepeatedMask[i % LaneSize] = SM_SentinelZero;
+      continue;
+    }
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // This entry crosses lanes, so there is no way to model this shuffle.
+      return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    // Adjust second vector indices to start at LaneSize instead of Size.
+    int LocalM =
+        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
+    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
+      // This is the first non-undef entry in this slot of a 128-bit lane.
+      RepeatedMask[i % LaneSize] = LocalM;
+    else if (RepeatedMask[i % LaneSize] != LocalM)
+      // Found a mismatch with the repeated mask.
+      return false;
+  }
+  return true;
+}
+
 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
 /// arguments.
 ///
@@ -26274,6 +26310,50 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
+  // Attempt to combine to SHUFPS.
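+  // SHUFPS selects the two low result elements from one source and the two
+  // high elements from the other, with a 2-bit index per element, so each
+  // half of the repeated mask must be serviced by a single vector (or be
+  // undef/zeroable).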
if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) { //Invert the cond to not(cond) : xor(op,allones)=not(op) diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index 884362edcf1a..1d9cb7e7824d 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -84,8 +84,7 @@ define <2 x float> @sltof2f32(<2 x i64> %a) { ; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; KNL-NEXT: retq ; ; SKX-LABEL: sltof2f32: diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll index 5855303e1278..c3542bff4ccc 100644 --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -89,7 +89,7 @@ define i64 @t4(<2 x double>* %a) { ; X32-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X32-SSE2-NEXT: movd %xmm1, %eax -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X32-SSE2-NEXT: movd %xmm0, %edx ; X32-SSE2-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/lower-bitcast.ll b/llvm/test/CodeGen/X86/lower-bitcast.ll index 426341dc90f0..62020c2d1914 100644 --- a/llvm/test/CodeGen/X86/lower-bitcast.ll +++ b/llvm/test/CodeGen/X86/lower-bitcast.ll @@ -9,7 +9,7 @@ define double @test1(double %A) { ; CHECK-LABEL: test1: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll index 5880c5c51e1b..003e2e60521b 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -1104,39 +1104,26 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp { ; SSE-LABEL: merge_4f32_f32_X0YY: ; SSE: # BB#0: -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: merge_4f32_f32_X0YY: ; AVX: # BB#0: ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] ; AVX-NEXT: retq ; -; X32-SSE1-LABEL: merge_4f32_f32_X0YY: -; X32-SSE1: # BB#0: -; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-SSE1-NEXT: retl -; -; 
+    if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
+      MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
+                          : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
+      V0 = DAG.getBitcast(NewVT, V0);
+      V1 = DAG.getBitcast(NewVT, V1);
+      return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
+    }
+
+    return SDValue();
+  }
   case X86ISD::INSERTPS: {
     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
     SDValue Op0 = N.getOperand(0);
@@ -28275,7 +28381,7 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
-  // Check if the first operand is all zeros.This situation only
+  // Check if the first operand is all zeros.This situation only
   // applies to avx512.
   if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) {
       //Invert the cond to not(cond) : xor(op,allones)=not(op)
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index 884362edcf1a..1d9cb7e7824d 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -84,8 +84,7 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: sltof2f32:
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 5855303e1278..c3542bff4ccc 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -89,7 +89,7 @@ define i64 @t4(<2 x double>* %a) {
 ; X32-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X32-SSE2-NEXT: movd %xmm1, %eax
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; X32-SSE2-NEXT: movd %xmm0, %edx
 ; X32-SSE2-NEXT: retl
 ;
diff --git a/llvm/test/CodeGen/X86/lower-bitcast.ll b/llvm/test/CodeGen/X86/lower-bitcast.ll
index 426341dc90f0..62020c2d1914 100644
--- a/llvm/test/CodeGen/X86/lower-bitcast.ll
+++ b/llvm/test/CodeGen/X86/lower-bitcast.ll
@@ -9,7 +9,7 @@ define double @test1(double %A) {
 ; CHECK-LABEL: test1:
 ; CHECK: # BB#0:
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 5880c5c51e1b..003e2e60521b 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -1104,39 +1104,26 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n
 define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_4f32_f32_X0YY:
 ; SSE: # BB#0:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: merge_4f32_f32_X0YY:
 ; AVX: # BB#0:
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
 ; AVX-NEXT: retq
 ;
-; X32-SSE1-LABEL: merge_4f32_f32_X0YY:
-; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE1-NEXT: retl
-;
-; X32-SSE41-LABEL: merge_4f32_f32_X0YY:
-; X32-SSE41: # BB#0:
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE41-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE41-NEXT: retl
+; X32-SSE-LABEL: merge_4f32_f32_X0YY:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; X32-SSE-NEXT: retl
   %val0 = load float, float* %ptr0, align 4
   %val1 = load float, float* %ptr1, align 4
   %res0 = insertelement <4 x float> undef, float %val0, i32 0
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 1047815b58a3..952db42842ef 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1100,48 +1100,46 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
 define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
 ; SSE2-LABEL: interleave_24i32_out:
 ; SSE2: # BB#0:
-; SSE2-NEXT: movdqu 80(%rdi), %xmm8
-; SSE2-NEXT: movups 64(%rdi), %xmm10
+; SSE2-NEXT: movups 80(%rdi), %xmm5
+; SSE2-NEXT: movups 64(%rdi), %xmm8
 ; SSE2-NEXT: movups (%rdi), %xmm0
-; SSE2-NEXT: movups 16(%rdi), %xmm7
-; SSE2-NEXT: movdqu 32(%rdi), %xmm9
-; SSE2-NEXT: movups 48(%rdi), %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm3[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,0]
-; SSE2-NEXT: movaps %xmm0, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm7[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,2,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,0],xmm0[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,3]
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3]
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1]
+; SSE2-NEXT: movups 16(%rdi), %xmm6
+; SSE2-NEXT: movups 32(%rdi), %xmm2
+; SSE2-NEXT: movups 48(%rdi), %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[2,3]
+; SSE2-NEXT: movaps %xmm5, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[2,3]
+; SSE2-NEXT: movaps %xmm2, %xmm7
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,1,0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
 ; SSE2-NEXT: movups %xmm3, 16(%rsi)
-; SSE2-NEXT: movups %xmm5, (%rsi)
-; SSE2-NEXT: movups %xmm2, 16(%rdx)
+; SSE2-NEXT: movups %xmm4, (%rsi)
+; SSE2-NEXT: movups %xmm1, 16(%rdx)
 ; SSE2-NEXT: movups %xmm0, (%rdx)
-; SSE2-NEXT: movupd %xmm1, 16(%rcx)
-; SSE2-NEXT: movupd %xmm4, (%rcx)
+; SSE2-NEXT: movupd %xmm7, 16(%rcx)
+; SSE2-NEXT: movupd %xmm9, (%rcx)
 ; SSE2-NEXT: retq
 ;
 ; SSE42-LABEL: interleave_24i32_out:
diff --git a/llvm/test/CodeGen/X86/pr30511.ll b/llvm/test/CodeGen/X86/pr30511.ll
index c4f7cea79900..053ae013b451 100644
--- a/llvm/test/CodeGen/X86/pr30511.ll
+++ b/llvm/test/CodeGen/X86/pr30511.ll
@@ -8,7 +8,7 @@ define i64 @PR30511(<2 x double> %a) {
 ; CHECK-LABEL: PR30511:
 ; CHECK: # BB#0:
 ; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
 ; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0
 ; CHECK-NEXT: movd %xmm0, %rax
diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
index 374ae0c4f8b6..ed957728aeff 100644
--- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
@@ -16,8 +16,8 @@ define i32 @test0(<1 x i64>* %v4) nounwind {
 ; X32-NEXT: movl %ecx, (%esp)
 ; X32-NEXT: pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3]
 ; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; X32-NEXT: movd %xmm0, %eax
 ; X32-NEXT: addl $32, %eax
 ; X32-NEXT: movl %ebp, %esp
@@ -55,8 +55,8 @@ define i32 @test1(i32* nocapture readonly %ptr) nounwind {
 ; X32-NEXT: movd (%eax), %mm0
 ; X32-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3]
 ; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; X32-NEXT: movd %xmm0, %eax
 ; X32-NEXT: emms
 ; X32-NEXT: movl %ebp, %esp
@@ -98,8 +98,8 @@ define i32 @test2(i32* nocapture readonly %ptr) nounwind {
 ; X32-NEXT: movl 8(%ebp), %eax
 ; X32-NEXT: pshufw $232, (%eax), %mm0 # mm0 = mem[0,2,2,3]
 ; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; X32-NEXT: movd %xmm0, %eax
 ; X32-NEXT: emms
 ; X32-NEXT: movl %ebp, %esp
@@ -149,8 +149,8 @@ define i32 @test4(x86_mmx %a) nounwind {
 ; X32-NEXT: andl $-8, %esp
 ; X32-NEXT: subl $8, %esp
 ; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,0,1]
 ; X32-NEXT: movd %xmm0, %eax
 ; X32-NEXT: movl %ebp, %esp
 ; X32-NEXT: popl %ebp
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 1b61de9d9b23..af6b4179cdfe 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1059,8 +1059,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: sitofp_2i64_to_4f32:
@@ -1071,8 +1070,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
 ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: sitofp_2i64_to_4f32:
@@ -1083,8 +1081,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
 ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512DQ-LABEL: sitofp_2i64_to_4f32:
@@ -1185,8 +1182,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: sitofp_4i64_to_4f32_undef:
@@ -1197,8 +1193,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef:
@@ -1209,8 +1204,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef:
@@ -1673,8 +1667,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
 ; VEX-NEXT: # BB#7:
 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
 ; VEX-NEXT: .LBB39_8:
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: uitofp_2i64_to_4f32:
@@ -1685,8 +1678,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
 ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: uitofp_2i64_to_4f32:
@@ -1697,8 +1689,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
 ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512DQ-LABEL: uitofp_2i64_to_4f32:
@@ -1911,8 +1902,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; VEX-NEXT: # BB#7:
 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
 ; VEX-NEXT: .LBB41_8:
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: uitofp_4i64_to_4f32_undef:
@@ -1923,8 +1913,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef:
@@ -1935,8 +1924,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index f71108a12abd..d11dac9b923b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1847,16 +1847,20 @@ define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
 define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
 ; SSE-LABEL: shuffle_v4f32_bitcast_4401:
 ; SSE: # BB#0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
+; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: shuffle_v4f32_bitcast_4401:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4f32_bitcast_4401:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4f32_bitcast_4401:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: retq
   %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32>
   %2 = bitcast <4 x i32> %1 to <2 x double>
   %3 = bitcast <4 x float> %a to <2 x double>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 8c962b682cd4..23e40a6572af 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -75,12 +75,16 @@ define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1
 define <4 x float> @combine_vpermil2ps_1z74(<4 x float> %a0, <4 x float> %a1) {
 ; X32-LABEL: combine_vpermil2ps_1z74:
 ; X32: # BB#0:
-; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[1],zero,xmm1[3,0]
+; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermil2ps_1z74:
 ; X64: # BB#0:
-; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[1],zero,xmm1[3,0]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; X64-NEXT: retq
   %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> , i8 0)
   %res1 = shufflevector <4 x float> %res0, <4 x float> zeroinitializer, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
index c6ab22295b3d..cfad89ec6fa4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -8,9 +8,9 @@ define void @test0(<1 x i64>* %x) {
 ; X32-LABEL: test0:
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X32-NEXT: movq %xmm0, (%eax)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-NEXT: movlps %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test0: