diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8ae0df6313af..b5a4159a48ba 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7542,9 +7542,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
     narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
     for (int i = 0; i != (int)MaskSize; ++i) {
-      if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
-        Mask.push_back(SM_SentinelUndef);
-      else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
+      // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
+      // loops converting between OR and BLEND shuffles due to
+      // canWidenShuffleElements merging away undef elements, meaning we
+      // fail to recognise the OR as the undef element isn't known zero.
+      if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
         Mask.push_back(SM_SentinelZero);
       else if (Mask1[i] == SM_SentinelZero)
         Mask.push_back(i);
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 123fba437141..9c32d75488dc 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -1245,10 +1245,10 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
 ; AVX1-LABEL: negative:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index ad177094f1ac..c88f8e47b0e5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1651,8 +1651,9 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u],zero,zero,zero,zero,zero,zero,xmm0[u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[u,u,0,1,4,5,8,9,u,u]
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index a9d9798ebc7a..dc54a60d33cc 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -3254,9 +3254,9 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
 ; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index a4755845b111..d741f2f9f36b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -21,3 +21,45 @@ define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %res0
 }
+
+define <16 x i8> @PR50049(<48 x i8>* %p1, <48 x i8>* %p2) {
+; SSE-LABEL: PR50049:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm2
+; SSE-NEXT:    movdqa 16(%rdi), %xmm0
+; SSE-NEXT:    movdqa 32(%rdi), %xmm1
+; SSE-NEXT:    movdqa (%rsi), %xmm4
+; SSE-NEXT:    movdqa 16(%rsi), %xmm5
+; SSE-NEXT:    movdqa 32(%rsi), %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm6 = <128,128,128,128,128,128,2,5,8,11,14,u,u,u,u,u>
+; SSE-NEXT:    pshufb %xmm6, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm7 = <0,3,6,9,12,15,128,128,128,128,128,u,u,u,u,u>
+; SSE-NEXT:    pshufb %xmm7, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    pshufb %xmm6, %xmm5
+; SSE-NEXT:    pshufb %xmm7, %xmm4
+; SSE-NEXT:    por %xmm5, %xmm4
+; SSE-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE-NEXT:    pmullw %xmm5, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm5, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm6 = <8,u,9,u,10,u,128,u,128,u,128,u,128,u,128,u>
+; SSE-NEXT:    pshufb %xmm6, %xmm4
+; SSE-NEXT:    movdqa {{.*#+}} xmm7 = <128,u,128,u,128,u,1,u,4,u,7,u,10,u,13,u>
+; SSE-NEXT:    pshufb %xmm7, %xmm3
+; SSE-NEXT:    por %xmm4, %xmm3
+; SSE-NEXT:    pshufb %xmm6, %xmm2
+; SSE-NEXT:    pshufb %xmm7, %xmm1
+; SSE-NEXT:    por %xmm2, %xmm1
+; SSE-NEXT:    pmullw %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    retq
+  %x1 = load <48 x i8>, <48 x i8>* %p1, align 16
+  %x2 = load <48 x i8>, <48 x i8>* %p2, align 16
+  %s1 = shufflevector <48 x i8> %x1, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %s2 = shufflevector <48 x i8> %x2, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %r = mul <16 x i8> %s1, %s2
+  ret <16 x i8> %r
+}