diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d976c57ac0f4..8856532ceeda 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24602,23 +24602,27 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
   }
 
   // Attempt to blend with zero.
-  if (VT.getVectorNumElements() <= 8 &&
+  if (NumMaskElts <= 8 &&
       ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
        (Subtarget.hasAVX() && VT.is256BitVector()))) {
     // Convert VT to a type compatible with X86ISD::BLENDI.
     // TODO - add 16i16 support (requires lane duplication).
-    MVT ShuffleVT = VT;
+    bool FloatDomain = VT.isFloatingPoint();
+    MVT ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
+                                : MVT::getIntegerVT(MaskEltSizeInBits);
+    ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts);
+
     if (Subtarget.hasAVX2()) {
-      if (VT == MVT::v4i64)
+      if (ShuffleVT == MVT::v4i64)
         ShuffleVT = MVT::v8i32;
-      else if (VT == MVT::v2i64)
+      else if (ShuffleVT == MVT::v2i64)
         ShuffleVT = MVT::v4i32;
     } else {
-      if (VT == MVT::v2i64 || VT == MVT::v4i32)
+      if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
         ShuffleVT = MVT::v8i16;
-      else if (VT == MVT::v4i64)
+      else if (ShuffleVT == MVT::v4i64)
         ShuffleVT = MVT::v4f64;
-      else if (VT == MVT::v8i32)
+      else if (ShuffleVT == MVT::v8i32)
         ShuffleVT = MVT::v8f32;
     }
 
diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 039f5cd89ef5..e05451b80271 100644
--- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -347,16 +347,11 @@ define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
 ; SSE-NEXT:    pinsrw $7, %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: _clearupper4xi32b:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: _clearupper4xi32b:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[4,5],zero,zero,xmm0[8,9],zero,zero,xmm0[12,13],zero,zero
-; AVX2-NEXT:    retq
+; AVX-LABEL: _clearupper4xi32b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT:    retq
   %x16 = bitcast <4 x i32> %0 to <8 x i16>
   %r0 = insertelement <8 x i16> %x16, i16 zeroinitializer, i32 1
   %r1 = insertelement <8 x i16> %r0, i16 zeroinitializer, i32 3
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 8ad8e136587e..f39aa4e93e79 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -107,7 +107,8 @@ define <16 x i8> @combine_vpperm_identity_bitcast(<16 x i8> %a0, <16 x i8> %a1)
 define <16 x i8> @combine_vpperm_as_blend_with_zero(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: combine_vpperm_as_blend_with_zero:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6,7]
 ; CHECK-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> )
   ret <16 x i8> %res0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
index 6dbb8f62f6fa..7dfbd565223e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -1182,26 +1182,47 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
-; AVX:       # BB#0:
-; AVX-NEXT:    movswq %di, %r10
-; AVX-NEXT:    movswq %si, %r11
-; AVX-NEXT:    movswq %dx, %rdx
-; AVX-NEXT:    movswq %cx, %rcx
-; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    movswq %r8w, %rdi
-; AVX-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    movswq %r9w, %rax
-; AVX-NEXT:    movzwl -40(%rsp,%r10,2), %esi
-; AVX-NEXT:    vmovd %esi, %xmm0
-; AVX-NEXT:    vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX-NEXT:    retq
+; AVX1-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    movswq %di, %r10
+; AVX1-NEXT:    movswq %si, %r11
+; AVX1-NEXT:    movswq %dx, %rdx
+; AVX1-NEXT:    movswq %cx, %rcx
+; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movswq %r8w, %rdi
+; AVX1-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movswq %r9w, %rax
+; AVX1-NEXT:    movzwl -40(%rsp,%r10,2), %esi
+; AVX1-NEXT:    vmovd %esi, %xmm0
+; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    movswq %di, %r10
+; AVX2-NEXT:    movswq %si, %r11
+; AVX2-NEXT:    movswq %dx, %rdx
+; AVX2-NEXT:    movswq %cx, %rcx
+; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movswq %r8w, %rdi
+; AVX2-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movswq %r9w, %rax
+; AVX2-NEXT:    movzwl -40(%rsp,%r10,2), %esi
+; AVX2-NEXT:    vmovd %esi, %xmm0
+; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2-NEXT:    retq
   %x0 = extractelement <8 x i16> %x, i16 %i0
   %y1 = extractelement <8 x i16> %y, i16 %i1
   %x2 = extractelement <8 x i16> %x, i16 %i2
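Note (illustrative sketch, not part of the patch): the blend-with-zero pattern that combineX86ShuffleChain now matches at mask-element granularity can be reproduced with a plain IR shuffle against zeroinitializer. The function name below is hypothetical; based on the updated checks above, on an SSE4.1/AVX target this would be expected to lower to a pxor/vpxor plus a pblendw/vpblendw against the zeroed register.

; Hypothetical example: keep the even i16 lanes of %a, zero the odd lanes.
; Mask indices 0..7 select from %a, 8..15 select from the zero vector.
define <8 x i16> @blend_even_with_zero(<8 x i16> %a) {
  %r = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer,
                     <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x i16> %r
}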