forked from OSchip/llvm-project
[X86] Rewrite LowerAVXCONCAT_VECTORS similar to how we handle vXi1 concats.
This is better able to detect undef and zero pieces in the concat, as well as cases where only one subvector is non-zero. This allows us to avoid silly things like double inserts into progressively larger undefs. This still builds 512-bit concats of 128-bit pieces by building up through 256 bits first, but I don't know if that's best. We probably want to merge this with the vXi1 concat code since they are very similar. llvm-svn: 327454
This commit is contained in:
parent
bc32433062
commit
cc060e921b
|
@ -5075,12 +5075,6 @@ static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
|
|||
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
|
||||
}
|
||||
|
||||
static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
|
||||
SelectionDAG &DAG, const SDLoc &dl) {
|
||||
assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
|
||||
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
|
||||
}
|
||||
|
||||
/// Widen a vector to a larger size with the same scalar type, with the new
|
||||
/// elements either zero or undef.
|
||||
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
|
||||
|
@ -5291,24 +5285,6 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
|
|||
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
|
||||
}
|
||||
|
||||
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
|
||||
/// instructions. This is used because creating CONCAT_VECTOR nodes of
|
||||
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
|
||||
/// large BUILD_VECTORS.
|
||||
static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
|
||||
unsigned NumElems, SelectionDAG &DAG,
|
||||
const SDLoc &dl) {
|
||||
SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
|
||||
return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
|
||||
}
|
||||
|
||||
static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
|
||||
unsigned NumElems, SelectionDAG &DAG,
|
||||
const SDLoc &dl) {
|
||||
SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
|
||||
return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
|
||||
}
|
||||
|
||||
static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
|
||||
unsigned NumElems, SelectionDAG &DAG,
|
||||
const SDLoc &dl, unsigned VectorWidth) {
|
||||
|
@ -8609,30 +8585,63 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
|
|||
|
||||
// 256-bit AVX can use the vinsertf128 instruction
|
||||
// to create 256-bit vectors from two other 128-bit ones.
|
||||
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
|
||||
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
|
||||
const X86Subtarget &Subtarget) {
|
||||
SDLoc dl(Op);
|
||||
MVT ResVT = Op.getSimpleValueType();
|
||||
|
||||
assert((ResVT.is256BitVector() ||
|
||||
ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
|
||||
|
||||
SDValue V1 = Op.getOperand(0);
|
||||
SDValue V2 = Op.getOperand(1);
|
||||
unsigned NumElems = ResVT.getVectorNumElements();
|
||||
if (ResVT.is256BitVector())
|
||||
return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
|
||||
unsigned NumOperands = Op.getNumOperands();
|
||||
unsigned NumZero = 0;
|
||||
unsigned NumNonZero = 0;
|
||||
unsigned NonZeros = 0;
|
||||
for (unsigned i = 0; i != NumOperands; ++i) {
|
||||
SDValue SubVec = Op.getOperand(i);
|
||||
if (SubVec.isUndef())
|
||||
continue;
|
||||
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
|
||||
++NumZero;
|
||||
else {
|
||||
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
|
||||
NonZeros |= 1 << i;
|
||||
++NumNonZero;
|
||||
}
|
||||
}
|
||||
|
||||
if (Op.getNumOperands() == 4) {
|
||||
// If there are zero or one non-zeros we can handle this very simply.
|
||||
if (NumNonZero <= 1) {
|
||||
SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
|
||||
: DAG.getUNDEF(ResVT);
|
||||
if (!NumNonZero)
|
||||
return Vec;
|
||||
unsigned Idx = countTrailingZeros(NonZeros);
|
||||
SDValue SubVec = Op.getOperand(Idx);
|
||||
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
|
||||
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
|
||||
DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
|
||||
}
|
||||
|
||||
if (NumOperands > 2) {
|
||||
MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
|
||||
ResVT.getVectorNumElements()/2);
|
||||
SDValue V3 = Op.getOperand(2);
|
||||
SDValue V4 = Op.getOperand(3);
|
||||
return concat256BitVectors(
|
||||
concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
|
||||
concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
|
||||
NumElems, DAG, dl);
|
||||
ArrayRef<SDUse> Ops = Op->ops();
|
||||
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
|
||||
Ops.slice(0, NumOperands/2));
|
||||
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
|
||||
Ops.slice(NumOperands/2));
|
||||
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
|
||||
}
|
||||
return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
|
||||
|
||||
assert(NumNonZero == 2 && "Simple cases not handled?");
|
||||
|
||||
SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
|
||||
DAG.getUNDEF(ResVT), Op.getOperand(0),
|
||||
DAG.getIntPtrConstant(0, dl));
|
||||
unsigned NumElems = ResVT.getVectorNumElements();
|
||||
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
|
||||
DAG.getIntPtrConstant(NumElems/2, dl));
|
||||
}
|
||||
|
||||
// Return true if all the operands of the given CONCAT_VECTORS node are zeros
|
||||
|
@ -8689,6 +8698,7 @@ static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
|
|||
return SDValue();
|
||||
}
|
||||
|
||||
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
|
||||
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
|
||||
const X86Subtarget &Subtarget,
|
||||
SelectionDAG & DAG) {
|
||||
|
@ -8775,7 +8785,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op,
|
|||
// from two other 128-bit ones.
|
||||
|
||||
// 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
|
||||
return LowerAVXCONCAT_VECTORS(Op, DAG);
|
||||
return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -3871,9 +3871,7 @@ multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
|
|||
|
||||
def : Pat<(masked_store addr:$dst, Mask,
|
||||
(_.info512.VT (insert_subvector undef,
|
||||
(_.info256.VT (insert_subvector undef,
|
||||
(_.info128.VT _.info128.RC:$src),
|
||||
(iPTR 0))),
|
||||
(_.info128.VT _.info128.RC:$src),
|
||||
(iPTR 0)))),
|
||||
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
|
||||
(COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
|
||||
|
@ -3888,9 +3886,7 @@ multiclass avx512_store_scalar_lowering_subreg<string InstrStr,
|
|||
|
||||
def : Pat<(masked_store addr:$dst, Mask,
|
||||
(_.info512.VT (insert_subvector undef,
|
||||
(_.info256.VT (insert_subvector undef,
|
||||
(_.info128.VT _.info128.RC:$src),
|
||||
(iPTR 0))),
|
||||
(_.info128.VT _.info128.RC:$src),
|
||||
(iPTR 0)))),
|
||||
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
|
||||
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
|
||||
|
@ -3913,9 +3909,7 @@ def : Pat<(_.info128.VT (extract_subvector
|
|||
def : Pat<(_.info128.VT (extract_subvector
|
||||
(_.info512.VT (masked_load addr:$srcAddr, Mask,
|
||||
(_.info512.VT (insert_subvector undef,
|
||||
(_.info256.VT (insert_subvector undef,
|
||||
(_.info128.VT (X86vzmovl _.info128.RC:$src)),
|
||||
(iPTR 0))),
|
||||
(_.info128.VT (X86vzmovl _.info128.RC:$src)),
|
||||
(iPTR 0))))),
|
||||
(iPTR 0))),
|
||||
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
|
||||
|
@ -3941,9 +3935,7 @@ def : Pat<(_.info128.VT (extract_subvector
|
|||
def : Pat<(_.info128.VT (extract_subvector
|
||||
(_.info512.VT (masked_load addr:$srcAddr, Mask,
|
||||
(_.info512.VT (insert_subvector undef,
|
||||
(_.info256.VT (insert_subvector undef,
|
||||
(_.info128.VT (X86vzmovl _.info128.RC:$src)),
|
||||
(iPTR 0))),
|
||||
(_.info128.VT (X86vzmovl _.info128.RC:$src)),
|
||||
(iPTR 0))))),
|
||||
(iPTR 0))),
|
||||
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
|
||||
|
|
|
@ -1764,15 +1764,11 @@ define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
|
|||
; X32-LABEL: test_mm512_zextps128_ps512:
|
||||
; X32: # %bb.0:
|
||||
; X32-NEXT: vmovaps %xmm0, %xmm0
|
||||
; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; X32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: test_mm512_zextps128_ps512:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: vmovaps %xmm0, %xmm0
|
||||
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
|
||||
; X64-NEXT: retq
|
||||
%res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <16 x float> %res
|
||||
|
|
|
@ -3054,7 +3054,6 @@ declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x fl
|
|||
define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
|
||||
; CHECK: ## %bb.0:
|
||||
; CHECK-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
|
||||
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
|
||||
|
@ -3075,7 +3074,6 @@ declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>,
|
|||
define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
|
||||
; CHECK: ## %bb.0:
|
||||
; CHECK-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
|
||||
|
|
Loading…
Reference in New Issue