From 5ac75d5628a1758c37e5e2c4f6ee229d399f9331 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 9 Dec 2017 22:44:42 +0000 Subject: [PATCH] [X86] Improve lowering of vXi1 insert_subvectors to better utilize (insert_subvector zero, vec, 0) for zeroing upper bits. This can be better recognized during isel when the producer already zeroed the upper bits. llvm-svn: 320267 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 154 ++++++++++-------- .../CodeGen/X86/avx512-skx-insert-subvec.ll | 36 ++-- 2 files changed, 101 insertions(+), 89 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9c5c40322daa..2b64ed756991 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5013,6 +5013,10 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (!isa(Idx)) return SDValue(); + // Inserting undef is a nop. We can just return the original vector. + if (SubVec.isUndef()) + return Vec; + unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0 && Vec.isUndef()) // the operation is legal return Op; @@ -5020,19 +5024,21 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, MVT OpVT = Op.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + + // Extend to natively supported kshift. + MVT WideOpVT = OpVT; + if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) + WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; + // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts // if necessary. if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { - if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) { - // Need to promote to v16i1, do the insert, then extract back. 
- Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, - getZeroVector(MVT::v16i1, Subtarget, DAG, dl), - SubVec, Idx); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, - DAG.getIntPtrConstant(0, dl)); - } - - return Op; + // May need to promote to a legal type. + Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + getZeroVector(WideOpVT, Subtarget, DAG, dl), + SubVec, Idx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } MVT SubVecVT = SubVec.getSimpleValueType(); @@ -5042,30 +5048,32 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, IdxVal % SubVecVT.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"); - // extend to natively supported kshift - MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; - MVT WideOpVT = OpVT; - if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits()) - WideOpVT = MinVT; - - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); SDValue Undef = DAG.getUNDEF(WideOpVT); - SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - Undef, SubVec, ZeroIdx); - // Extract sub-vector if require. - auto ExtractSubVec = [&](SDValue V) { - return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, - OpVT, V, ZeroIdx); - }; + if (IdxVal == 0) { + // Zero lower bits of the Vec + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, + ZeroIdx); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); + // Merge them together, SubVec should be zero extended. 
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + getZeroVector(WideOpVT, Subtarget, DAG, dl), + SubVec, ZeroIdx); + Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, + ZeroIdx); + } + + SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + Undef, SubVec, ZeroIdx); if (Vec.isUndef()) { - if (IdxVal != 0) { - SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8); - WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, - ShiftBits); - } - return ExtractSubVec(WideSubVec); + assert(IdxVal != 0 && "Unexpected index"); + Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } if (ISD::isBuildVectorAllZeros(Vec.getNode())) { @@ -5073,48 +5081,60 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, - DAG.getConstant(ShiftRight, dl, MVT::i8)); - return ExtractSubVec(Vec); - } - - if (IdxVal == 0) { - // Zero lower bits of the Vec - SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); - // Merge them together, SubVec should be zero extended. 
- WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), - SubVec, ZeroIdx); - Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); - return ExtractSubVec(Vec); + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, + DAG.getConstant(ShiftRight, dl, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { - // Zero upper bits of the Vec - WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, - DAG.getConstant(IdxVal, dl, MVT::i8)); - NumElems = WideOpVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8); - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); - return ExtractSubVec(Vec); + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + if (SubVecNumElems * 2 == NumElems) { + // Special case, use legal zero extending insert_subvector. This allows + // isel to optimize when bits are known zero. + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + getZeroVector(WideOpVT, Subtarget, DAG, dl), + Vec, ZeroIdx); + } else { + // Otherwise use explicit shifts to zero the bits. 
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + Undef, Vec, ZeroIdx); + NumElems = WideOpVT.getVectorNumElements(); + SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); + } + Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } - // Subvector should be inserted in the middle - use shuffle - WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, - SubVec, ZeroIdx); - SmallVector Mask; - for (unsigned i = 0; i < NumElems; ++i) - Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? - i : i + NumElems); - return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); + + // Inserting into the middle is more complicated. + + NumElems = WideOpVT.getVectorNumElements(); + + // Widen the vector if needed. + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); + // Move the current value of the bits to be replaced to the lsbs. + Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + // Xor with the new bit. + Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); + // Shift to MSB, filling bottom bits with 0. + unsigned ShiftLeft = NumElems - SubVecNumElems; + Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + // Shift to the final position, filling upper bits with 0. + unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; + Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, + DAG.getConstant(ShiftRight, dl, MVT::i8)); + // Xor with original vector leaving the new value. + Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); + // Reduce to original width if needed. 
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 diff --git a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll index 4421fc096732..1182bbf94ec5 100644 --- a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -56,14 +56,12 @@ define <8 x i1> @test3(<4 x i1> %a) { define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) { ; CHECK-LABEL: test4: ; CHECK: # %bb.0: +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k0 ; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 -; CHECK-NEXT: vpslld $31, %xmm1, %xmm0 ; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: kshiftlb $4, %k1, %k1 ; CHECK-NEXT: kshiftlb $4, %k0, %k0 -; CHECK-NEXT: kshiftrb $4, %k0, %k0 -; CHECK-NEXT: korb %k1, %k0, %k0 +; CHECK-NEXT: korb %k0, %k1, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: retq @@ -74,14 +72,12 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) { define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: +; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1 +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k0 ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 -; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 -; CHECK-NEXT: vpsllq $63, %xmm1, %xmm0 ; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 -; CHECK-NEXT: kshiftlb $2, %k1, %k1 -; CHECK-NEXT: kshiftlb $6, %k0, %k0 -; CHECK-NEXT: kshiftrb $6, %k0, %k0 -; CHECK-NEXT: korb %k1, %k0, %k0 +; CHECK-NEXT: kshiftlb $2, %k0, %k0 +; CHECK-NEXT: korb %k0, %k1, %k0 ; CHECK-NEXT: vpmovm2d %k0, %xmm0 ; CHECK-NEXT: retq @@ -92,14 +88,12 @@ define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) { define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) { ; CHECK-LABEL: test6: ; CHECK: # %bb.0: +; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1 +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k0 ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 -; 
CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 -; CHECK-NEXT: vpsllq $63, %xmm1, %xmm0 ; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 -; CHECK-NEXT: kshiftlb $2, %k1, %k1 -; CHECK-NEXT: kshiftlb $6, %k0, %k0 -; CHECK-NEXT: kshiftrb $6, %k0, %k0 -; CHECK-NEXT: korb %k1, %k0, %k0 +; CHECK-NEXT: kshiftlb $2, %k0, %k0 +; CHECK-NEXT: korb %k0, %k1, %k0 ; CHECK-NEXT: vpmovm2b %k0, %xmm0 ; CHECK-NEXT: retq @@ -110,14 +104,12 @@ define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) { define <32 x i1> @test7(<4 x i1> %a, <4 x i1>%b) { ; CHECK-LABEL: test7: ; CHECK: # %bb.0: +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k0 ; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 -; CHECK-NEXT: vpslld $31, %xmm1, %xmm0 ; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: kshiftlb $4, %k1, %k1 ; CHECK-NEXT: kshiftlb $4, %k0, %k0 -; CHECK-NEXT: kshiftrb $4, %k0, %k0 -; CHECK-NEXT: korb %k1, %k0, %k0 +; CHECK-NEXT: korb %k0, %k1, %k0 ; CHECK-NEXT: vpmovm2b %k0, %ymm0 ; CHECK-NEXT: retq