[x86] Refactor the logic to form SHUFPS instruction patterns to lower
a generic vector shuffle mask into a helper that isn't specific to the
other things that influence which choice is made or the specific types
used with the instruction.

No functionality changed.

llvm-svn: 218215
parent 33eda72802
commit 02f3554971
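For context on the immediate the lowering below builds: SHUFPS takes a 2-bit selector per result lane, with lanes 0 and 1 drawn from the first operand and lanes 2 and 3 from the second. The following is a minimal standalone sketch of that encoding, assuming the standard SHUFPS immediate layout; the helper name is invented here for illustration, while the in-tree routine the diff calls is getV4X86ShuffleImm8ForMask.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative helper (not the patch's code): pack a 4-lane shuffle mask into
// the 2-bit-per-lane imm8 that SHUFPS-style nodes consume. Result lanes 0-1
// read from the first operand, lanes 2-3 from the second; only the low two
// bits of each mask element are encoded.
static uint8_t shufpsImm8ForMask(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i) {
    assert(Mask[i] >= 0 && Mask[i] < 8 && "expected a two-input v4 mask");
    Imm |= static_cast<uint8_t>((Mask[i] & 3) << (2 * i));
  }
  return Imm;
}

int main() {
  // Mask {0, 1, 4, 5}: low half of the first input followed by the low half
  // of the second, exactly the shape a single SHUFPS can produce.
  std::array<int, 4> Mask = {0, 1, 4, 5};
  std::printf("imm8 = 0x%02x\n",
              static_cast<unsigned>(shufpsImm8ForMask(Mask))); // imm8 = 0x44
  return 0;
}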
@@ -7765,107 +7765,25 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
 }
 
-/// \brief Lower 4-lane 32-bit floating point shuffles.
+/// \brief Lower a vector shuffle using the SHUFPS instruction.
 ///
-/// Uses instructions exclusively from the floating point unit to minimize
-/// domain crossing penalties, as these are sufficient to implement all v4f32
-/// shuffles.
-static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
-                                       const X86Subtarget *Subtarget,
-                                       SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
-  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
-  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  ArrayRef<int> Mask = SVOp->getMask();
-  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
-
+/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
+/// It makes no assumptions about whether this is the *best* lowering, it simply
+/// uses it.
+static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
   SDValue LowV = V1, HighV = V2;
   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
 
   int NumV2Elements =
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
-  if (NumV2Elements == 0) {
-    if (Subtarget->hasAVX()) {
-      // If we have AVX, we can use VPERMILPS which will allow folding a load
-      // into the shuffle.
-      return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f32, V1,
-                         getV4X86ShuffleImm8ForMask(Mask, DAG));
-    }
-
-    // Otherwise, use a straight shuffle of a single input vector. We pass the
-    // input vector to both operands to simulate this with a SHUFPS.
-    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
-                       getV4X86ShuffleImm8ForMask(Mask, DAG));
-  }
-
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
-  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-
-  // There are special ways we can lower some single-element blends. However, we
-  // have custom ways we can lower more complex single-element blends below that
-  // we defer to if both this and BLENDPS fail to match, so restrict this to
-  // when the V2 input is targeting element 0 of the mask -- that is the fast
-  // case here.
-  if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
-                                                         Mask, Subtarget, DAG))
-      return V;
-
-  if (Subtarget->hasSSE41())
-    if (SDValue Blend =
-            lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
-      return Blend;
-
   if (NumV2Elements == 1) {
     int V2Index =
         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
         Mask.begin();
 
-    // Check for whether we can use INSERTPS to perform the blend. We only use
-    // INSERTPS when the V1 elements are already in the correct locations
-    // because otherwise we can just always use two SHUFPS instructions which
-    // are much smaller to encode than a SHUFPS and an INSERTPS.
-    if (Subtarget->hasSSE41()) {
-      // When using INSERTPS we can zero any lane of the destination. Collect
-      // the zero inputs into a mask and drop them from the lanes of V1 which
-      // actually need to be present as inputs to the INSERTPS.
-      SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
-      // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
-      bool InsertNeedsShuffle = false;
-      unsigned ZMask = 0;
-      for (int i = 0; i < 4; ++i)
-        if (i != V2Index) {
-          if (Zeroable[i]) {
-            ZMask |= 1 << i;
-          } else if (Mask[i] != i) {
-            InsertNeedsShuffle = true;
-            break;
-          }
-        }
-
-      // We don't want to use INSERTPS or other insertion techniques if it will
-      // require shuffling anyways.
-      if (!InsertNeedsShuffle) {
-        // If all of V1 is zeroable, replace it with undef.
-        if ((ZMask | 1 << V2Index) == 0xF)
-          V1 = DAG.getUNDEF(MVT::v4f32);
-
-        unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
-        assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
-
-        // Insert the V2 element into the desired position.
-        return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
-                           DAG.getConstant(InsertPSMask, MVT::i8));
-      }
-    }
-
     // Compute the index adjacent to V2Index and in the same half by toggling
     // the low bit.
     int V2AdjIndex = V2Index ^ 1;
@@ -7929,6 +7847,107 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
 }
 
+/// \brief Lower 4-lane 32-bit floating point shuffles.
+///
+/// Uses instructions exclusively from the floating point unit to minimize
+/// domain crossing penalties, as these are sufficient to implement all v4f32
+/// shuffles.
+static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+  if (NumV2Elements == 0) {
+    if (Subtarget->hasAVX()) {
+      // If we have AVX, we can use VPERMILPS which will allow folding a load
+      // into the shuffle.
+      return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f32, V1,
+                         getV4X86ShuffleImm8ForMask(Mask, DAG));
+    }
+
+    // Otherwise, use a straight shuffle of a single input vector. We pass the
+    // input vector to both operands to simulate this with a SHUFPS.
+    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
+                       getV4X86ShuffleImm8ForMask(Mask, DAG));
+  }
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+
+  // There are special ways we can lower some single-element blends. However, we
+  // have custom ways we can lower more complex single-element blends below that
+  // we defer to if both this and BLENDPS fail to match, so restrict this to
+  // when the V2 input is targeting element 0 of the mask -- that is the fast
+  // case here.
+  if (NumV2Elements == 1 && Mask[0] >= 4)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
+      return V;
+
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend =
+            lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
+      return Blend;
+
+  // Check for whether we can use INSERTPS to perform the blend. We only use
+  // INSERTPS when the V1 elements are already in the correct locations
+  // because otherwise we can just always use two SHUFPS instructions which
+  // are much smaller to encode than a SHUFPS and an INSERTPS.
+  if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
+    int V2Index =
+        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
+        Mask.begin();
+
+    // When using INSERTPS we can zero any lane of the destination. Collect
+    // the zero inputs into a mask and drop them from the lanes of V1 which
+    // actually need to be present as inputs to the INSERTPS.
+    SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+    // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
+    bool InsertNeedsShuffle = false;
+    unsigned ZMask = 0;
+    for (int i = 0; i < 4; ++i)
+      if (i != V2Index) {
+        if (Zeroable[i]) {
+          ZMask |= 1 << i;
+        } else if (Mask[i] != i) {
+          InsertNeedsShuffle = true;
+          break;
+        }
+      }
+
+    // We don't want to use INSERTPS or other insertion techniques if it will
+    // require shuffling anyways.
+    if (!InsertNeedsShuffle) {
+      // If all of V1 is zeroable, replace it with undef.
+      if ((ZMask | 1 << V2Index) == 0xF)
+        V1 = DAG.getUNDEF(MVT::v4f32);
+
+      unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
+      assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+      // Insert the V2 element into the desired position.
+      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                         DAG.getConstant(InsertPSMask, MVT::i8));
+    }
+  }
+
+  // Otherwise fall back to a SHUFPS lowering strategy.
+  return lowerVectorShuffleWithSHUPFS(DL, MVT::v4f32, Mask, V1, V2, DAG);
+}
+
 /// \brief Lower 4-lane i32 vector shuffles.
 ///
 /// We try to handle these with integer-domain shuffles where we can, but for
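One further note on the INSERTPS path in the new lowerV4F32VectorShuffle: it packs the immediate as (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask. Below is a small sketch of that layout, assuming the standard INSERTPS fields (bits 7:6 select the source element, bits 5:4 the destination lane, bits 3:0 the lanes to zero); the encoder name and the worked mask are illustrative, not taken from the patch.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative encoder mirroring the patch's InsertPSMask expression.
// imm8 bits 7:6 pick the element of the second source (COUNT_S), bits 5:4
// pick the destination lane to overwrite (COUNT_D), and bits 3:0 zero the
// corresponding destination lanes (ZMASK).
static uint8_t insertPSImm8(int SrcElt, int DstLane, unsigned ZMask) {
  assert(SrcElt >= 0 && SrcElt < 4 && "source element out of range");
  assert(DstLane >= 0 && DstLane < 4 && "destination lane out of range");
  assert((ZMask & ~0xFu) == 0 && "zero mask uses only the low four bits");
  return static_cast<uint8_t>(SrcElt << 6 | DstLane << 4 | ZMask);
}

int main() {
  // For shuffle mask {0, 1, 2, 7}: V2 feeds lane 3, so Mask[3] - 4 == 3 is the
  // source element, 3 is the destination lane, and no lanes are zeroed.
  unsigned Imm = insertPSImm8(/*SrcElt=*/3, /*DstLane=*/3, /*ZMask=*/0);
  std::printf("imm8 = 0x%02x\n", Imm); // prints imm8 = 0xf0
  return 0;
}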