diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 47a66c55b80a..1fb398386e98 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -6691,12 +6691,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget.is64Bit())) { - if (VT.is512BitVector()) { - SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, - Item, DAG.getIntPtrConstant(0, dl)); - } - assert((VT.is128BitVector() || VT.is256BitVector()) && + assert((VT.is128BitVector() || VT.is256BitVector() || + VT.is512BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. @@ -12091,6 +12087,16 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. + int NumElts = Mask.size(); + int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); + + if (NumV2Elements == 1 && Mask[0] >= NumElts) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, VT, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 893078c299c3..86afd627ed84 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3305,6 +3305,10 @@ let Predicates = [HasAVX512] in { def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>; + + def : Pat<(v8i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>; } // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. let AddedComplexity = 20 in { @@ -3330,6 +3334,10 @@ let Predicates = [HasAVX512] in { (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>; + def : Pat<(v16i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>; + // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext. def : Pat<(v8i64 (X86vzload addr:$src)), (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll index 6cd0366d5adf..7172d53c78f2 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -297,3 +297,14 @@ define <16 x float> @shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle } + +define <16 x i32> @insert_mem_and_zero_v16i32(i32* %ptr) { +; ALL-LABEL: insert_mem_and_zero_v16i32: +; ALL: # BB#0: +; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: retq + %a = load i32, i32* %ptr + %v = insertelement <16 x i32> undef, i32 %a, i32 0 + %shuffle = shufflevector <16 x i32> %v, <16 x i32> zeroinitializer, <16 x i32> + ret <16 x i32> %shuffle +} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index 6e2df62d5edf..c48821b639f8 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -147,20 +147,15 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_70000000: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: movl $7, %eax -; AVX512F-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2 -; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_70000000: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-32-NEXT: movl $7, %eax -; AVX512F-32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1 -; AVX512F-32-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; AVX512F-32-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1 +; AVX512F-32-NEXT: vmovd %eax, %xmm1 ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -1116,20 +1111,15 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_70000000: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: movl $7, %eax -; AVX512F-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2 -; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_70000000: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-32-NEXT: movl $7, %eax -; AVX512F-32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1 -; AVX512F-32-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; AVX512F-32-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1 +; AVX512F-32-NEXT: vmovd %eax, %xmm1 ; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>