[x86] Refactor the logic to form SHUFPS instruction patterns to lower
a generic vector shuffle mask into a helper that isn't specific to the
other things that influence which choice is made or the specific types
used with the instruction.

No functionality changed.

llvm-svn: 218215
parent 33eda72802
commit 02f3554971
@@ -7765,107 +7765,25 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
 }
 
-/// \brief Lower 4-lane 32-bit floating point shuffles.
+/// \brief Lower a vector shuffle using the SHUFPS instruction.
 ///
-/// Uses instructions exclusively from the floating point unit to minimize
-/// domain crossing penalties, as these are sufficient to implement all v4f32
-/// shuffles.
-static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
-                                       const X86Subtarget *Subtarget,
-                                       SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
-  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
-  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  ArrayRef<int> Mask = SVOp->getMask();
-  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
-
+/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
+/// It makes no assumptions about whether this is the *best* lowering, it simply
+/// uses it.
+static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
   SDValue LowV = V1, HighV = V2;
   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
 
   int NumV2Elements =
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
-  if (NumV2Elements == 0) {
-    if (Subtarget->hasAVX()) {
-      // If we have AVX, we can use VPERMILPS which will allow folding a load
-      // into the shuffle.
-      return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f32, V1,
-                         getV4X86ShuffleImm8ForMask(Mask, DAG));
-    }
-
-    // Otherwise, use a straight shuffle of a single input vector. We pass the
-    // input vector to both operands to simulate this with a SHUFPS.
-    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
-                       getV4X86ShuffleImm8ForMask(Mask, DAG));
-  }
-
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
-  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-
-  // There are special ways we can lower some single-element blends. However, we
-  // have custom ways we can lower more complex single-element blends below that
-  // we defer to if both this and BLENDPS fail to match, so restrict this to
-  // when the V2 input is targeting element 0 of the mask -- that is the fast
-  // case here.
-  if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
-                                                         Mask, Subtarget, DAG))
-      return V;
-
-  if (Subtarget->hasSSE41())
-    if (SDValue Blend =
-            lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
-      return Blend;
-
   if (NumV2Elements == 1) {
     int V2Index =
         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
         Mask.begin();
 
-    // Check for whether we can use INSERTPS to perform the blend. We only use
-    // INSERTPS when the V1 elements are already in the correct locations
-    // because otherwise we can just always use two SHUFPS instructions which
-    // are much smaller to encode than a SHUFPS and an INSERTPS.
-    if (Subtarget->hasSSE41()) {
-      // When using INSERTPS we can zero any lane of the destination. Collect
-      // the zero inputs into a mask and drop them from the lanes of V1 which
-      // actually need to be present as inputs to the INSERTPS.
-      SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
-      // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
-      bool InsertNeedsShuffle = false;
-      unsigned ZMask = 0;
-      for (int i = 0; i < 4; ++i)
-        if (i != V2Index) {
-          if (Zeroable[i]) {
-            ZMask |= 1 << i;
-          } else if (Mask[i] != i) {
-            InsertNeedsShuffle = true;
-            break;
-          }
-        }
-
-      // We don't want to use INSERTPS or other insertion techniques if it will
-      // require shuffling anyways.
-      if (!InsertNeedsShuffle) {
-        // If all of V1 is zeroable, replace it with undef.
-        if ((ZMask | 1 << V2Index) == 0xF)
-          V1 = DAG.getUNDEF(MVT::v4f32);
-
-        unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
-        assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
-
-        // Insert the V2 element into the desired position.
-        return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
-                           DAG.getConstant(InsertPSMask, MVT::i8));
-      }
-    }
-
     // Compute the index adjacent to V2Index and in the same half by toggling
     // the low bit.
     int V2AdjIndex = V2Index ^ 1;
@@ -7929,6 +7847,107 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
 }
 
+/// \brief Lower 4-lane 32-bit floating point shuffles.
+///
+/// Uses instructions exclusively from the floating point unit to minimize
+/// domain crossing penalties, as these are sufficient to implement all v4f32
+/// shuffles.
+static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+  if (NumV2Elements == 0) {
+    if (Subtarget->hasAVX()) {
+      // If we have AVX, we can use VPERMILPS which will allow folding a load
+      // into the shuffle.
+      return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f32, V1,
+                         getV4X86ShuffleImm8ForMask(Mask, DAG));
+    }
+
+    // Otherwise, use a straight shuffle of a single input vector. We pass the
+    // input vector to both operands to simulate this with a SHUFPS.
+    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
+                       getV4X86ShuffleImm8ForMask(Mask, DAG));
+  }
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+
+  // There are special ways we can lower some single-element blends. However, we
+  // have custom ways we can lower more complex single-element blends below that
+  // we defer to if both this and BLENDPS fail to match, so restrict this to
+  // when the V2 input is targeting element 0 of the mask -- that is the fast
+  // case here.
+  if (NumV2Elements == 1 && Mask[0] >= 4)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
+      return V;
+
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend =
+            lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
+      return Blend;
+
+  // Check for whether we can use INSERTPS to perform the blend. We only use
+  // INSERTPS when the V1 elements are already in the correct locations
+  // because otherwise we can just always use two SHUFPS instructions which
+  // are much smaller to encode than a SHUFPS and an INSERTPS.
+  if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
+    int V2Index =
+        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
+        Mask.begin();
+
+    // When using INSERTPS we can zero any lane of the destination. Collect
+    // the zero inputs into a mask and drop them from the lanes of V1 which
+    // actually need to be present as inputs to the INSERTPS.
+    SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+    // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
+    bool InsertNeedsShuffle = false;
+    unsigned ZMask = 0;
+    for (int i = 0; i < 4; ++i)
+      if (i != V2Index) {
+        if (Zeroable[i]) {
+          ZMask |= 1 << i;
+        } else if (Mask[i] != i) {
+          InsertNeedsShuffle = true;
+          break;
+        }
+      }
+
+    // We don't want to use INSERTPS or other insertion techniques if it will
+    // require shuffling anyways.
+    if (!InsertNeedsShuffle) {
+      // If all of V1 is zeroable, replace it with undef.
+      if ((ZMask | 1 << V2Index) == 0xF)
+        V1 = DAG.getUNDEF(MVT::v4f32);
+
+      unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
+      assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+      // Insert the V2 element into the desired position.
+      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                         DAG.getConstant(InsertPSMask, MVT::i8));
+    }
+  }
+
+  // Otherwise fall back to a SHUFPS lowering strategy.
+  return lowerVectorShuffleWithSHUPFS(DL, MVT::v4f32, Mask, V1, V2, DAG);
+}
+
 /// \brief Lower 4-lane i32 vector shuffles.
 ///
 /// We try to handle these with integer-domain shuffles where we can, but for
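Two details of the shuffle immediates used in this patch are easy to gloss over. First, the InsertPSMask computed in the INSERTPS path packs three fields into a single byte. The standalone sketch below is illustration only, not LLVM code: the function name, sample mask, and driver are invented here, while the packing expression itself is copied from the patch.

```cpp
#include <cassert>
#include <cstdio>

// Illustrative re-creation (hypothetical helper, not LLVM's) of the patch's
// immediate computation: (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask.
// Bits [7:6] pick the source element of V2, bits [5:4] pick the destination
// lane it lands in, and bits [3:0] force destination lanes to zero.
static unsigned computeInsertPSMask(const int Mask[4], int V2Index,
                                    unsigned ZMask) {
  unsigned InsertPSMask =
      (unsigned)(Mask[V2Index] - 4) << 6 | (unsigned)V2Index << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  return InsertPSMask;
}

int main() {
  // v4f32 shuffle mask {0, 1, 6, 3}: indices 4-7 name V2's elements, so
  // lane 2 is the single element taken from V2; the rest come from V1.
  const int Mask[4] = {0, 1, 6, 3};
  const int V2Index = 2;    // position of the V2 element in the result
  const unsigned ZMask = 0; // no lanes zeroed in this example
  std::printf("imm8 = 0x%02X\n", computeInsertPSMask(Mask, V2Index, ZMask));
  // Prints imm8 = 0xA0: source element 2 in bits [7:6], destination lane 2
  // in bits [5:4], no zeroed lanes.
  return 0;
}
```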
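Second, the repeated getV4X86ShuffleImm8ForMask(...) calls build a SHUFPS/PSHUFD-style immediate, which is just four 2-bit lane selectors packed from the low bits upward. A minimal sketch under the assumption that every mask entry is a concrete lane index (the real helper also has to cope with undef sentinel entries); the function name is hypothetical:

```cpp
#include <cstdio>

// Minimal sketch of a SHUFPS/PSHUFD-style imm8: result element i takes its
// 2-bit source selector from bits [2*i+1 : 2*i] of the immediate. Indices
// are reduced modulo 4, matching SHUFPS's per-operand lane addressing
// (mask entries 4-7 address lanes 0-3 of the second operand).
static unsigned shuffleImm8ForMask(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= (unsigned)(Mask[i] & 0x3) << (2 * i);
  return Imm;
}

int main() {
  // Mask {2, 1, 6, 7}: selectors become {2, 1, 2, 3}, i.e. imm8 = 0xE6.
  const int Mask[4] = {2, 1, 6, 7};
  std::printf("imm8 = 0x%02X\n", shuffleImm8ForMask(Mask));
  return 0;
}
```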