Revamp build_vector lowering to take advantage of movss and movd instructions.

movd always clears the top 96 bits, and movss does so when it is loading the
value from memory.
The net result is that codegen for 4-wide shuffles is much improved. It is near
optimal if one or more elements are zero, e.g.

__m128i test(int a, int b) {
  return _mm_set_epi32(0, 0, b, a);
}

compiles to

_test:
	movd 8(%esp), %xmm1
	movd 4(%esp), %xmm0
	punpckldq %xmm1, %xmm0
	ret

Compared to gcc:

_test:
	subl	$12, %esp
	movd	20(%esp), %xmm0
	movd	16(%esp), %xmm1
	punpckldq	%xmm0, %xmm1
	movq	%xmm1, %xmm0
	movhps	LC0, %xmm0
	addl	$12, %esp
	ret

or icc:

_test:
        movd      4(%esp), %xmm0                                #5.10
        movd      8(%esp), %xmm3                                #5.10
        xorl      %eax, %eax                                    #5.10
        movd      %eax, %xmm1                                   #5.10
        punpckldq %xmm1, %xmm0                                  #5.10
        movd      %eax, %xmm2                                   #5.10
        punpckldq %xmm2, %xmm3                                  #5.10
        punpckldq %xmm3, %xmm0                                  #5.10
        ret                                                     #5.10

There is still room for improvement, for example in the FP variant of the above example:

__m128 test(float a, float b) {
  return _mm_set_ps(0.0, 0.0, b, a);
}

_test:
	movss 8(%esp), %xmm1
	movss 4(%esp), %xmm0
	unpcklps %xmm1, %xmm0
	xorps %xmm1, %xmm1
	movlhps %xmm1, %xmm0
	ret

The xorps and movlhps are unnecessary. Eliminating them will require a post-legalizer optimization.

llvm-svn: 27939
This commit is contained in:
Evan Cheng 2006-04-21 23:03:30 +00:00
parent 57a32f0bc1
commit 14215c36b6
1 changed files with 140 additions and 63 deletions

View File

@ -2138,19 +2138,19 @@ static inline bool isZeroNode(SDOperand Elt) {
cast<ConstantFPSDNode>(Elt)->isExactlyValue(0.0))); cast<ConstantFPSDNode>(Elt)->isExactlyValue(0.0)));
} }
/// getShuffleVectorAgainstZero - Return a vector_shuffle of a zero vector and /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// the specified vector. /// vector and zero or undef vector.
static SDOperand getShuffleVectorAgainstZero(SDOperand Vec, MVT::ValueType VT, static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, MVT::ValueType VT,
unsigned NumElems, unsigned Idx, unsigned NumElems, unsigned Idx,
SelectionDAG &DAG) { bool isZero, SelectionDAG &DAG) {
SDOperand ZeroV = getZeroVector(VT, DAG); SDOperand V1 = isZero ? getZeroVector(VT, DAG) : DAG.getNode(ISD::UNDEF, VT);
MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems); MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
MVT::ValueType EVT = MVT::getVectorBaseType(MaskVT); MVT::ValueType EVT = MVT::getVectorBaseType(MaskVT);
SDOperand Zero = DAG.getConstant(0, EVT); SDOperand Zero = DAG.getConstant(0, EVT);
std::vector<SDOperand> MaskVec(NumElems, Zero); std::vector<SDOperand> MaskVec(NumElems, Zero);
MaskVec[Idx] = DAG.getConstant(NumElems, EVT); MaskVec[Idx] = DAG.getConstant(NumElems, EVT);
SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, MaskVec); SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, MaskVec);
return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, ZeroV, Vec, Mask); return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
} }
/// LowerOperation - Provide custom lowering hooks for some operations. /// LowerOperation - Provide custom lowering hooks for some operations.
@ -3005,7 +3005,7 @@ SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) || if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
X86::isUNPCKLMask(PermMask.Val) || X86::isUNPCKLMask(PermMask.Val) ||
X86::isUNPCKHMask(PermMask.Val, V2IsSplat)) X86::isUNPCKHMask(PermMask.Val))
return Op; return Op;
if (V2IsSplat) { if (V2IsSplat) {
@ -3137,82 +3137,159 @@ SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
return Op; return Op;
unsigned NumElems = Op.getNumOperands(); unsigned NumElems = Op.getNumOperands();
unsigned Half = NumElems/2;
MVT::ValueType VT = Op.getValueType(); MVT::ValueType VT = Op.getValueType();
MVT::ValueType EVT = MVT::getVectorBaseType(VT); MVT::ValueType EVT = MVT::getVectorBaseType(VT);
std::vector<unsigned> NonZeros; unsigned NumZero = 0;
unsigned NonZeros = 0;
std::set<SDOperand> Values; std::set<SDOperand> Values;
for (unsigned i = 0; i < NumElems; ++i) { for (unsigned i = 0; i < NumElems; ++i) {
unsigned Idx = NumElems - i - 1; SDOperand Elt = Op.getOperand(i);
SDOperand Elt = Op.getOperand(Idx);
Values.insert(Elt); Values.insert(Elt);
if (!isZeroNode(Elt)) if (isZeroNode(Elt))
NonZeros.push_back(Idx); NumZero++;
else if (Elt.getOpcode() != ISD::UNDEF)
NonZeros |= (1 << i);
} }
if (NonZeros.size() == 0) unsigned NumNonZero = CountPopulation_32(NonZeros);
if (NumNonZero == 0)
return Op; return Op;
if (NonZeros.size() == 1) { // Splat is obviously ok. Let legalizer expand it to a shuffle.
unsigned Idx = NonZeros[0]; if (Values.size() == 1)
SDOperand Item = Op.getOperand(Idx); return SDOperand();
if (Idx == 0 || MVT::getSizeInBits(EVT) >= 32)
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR,VT, Item);
if (Idx == 0)
return getShuffleVectorAgainstZero(Item, VT, NumElems, Idx, DAG);
// If element VT is < 32, convert it to a insert into a zero vector. // If element VT is >= 32 bits, turn it into a number of shuffles.
if (MVT::getSizeInBits(EVT) <= 16) { if (NumNonZero == 1) {
SDOperand ZeroV; unsigned Idx = CountTrailingZeros_32(NonZeros);
if (EVT == MVT::i8) { SDOperand Item = Op.getOperand(Idx);
Item = DAG.getNode(ISD::ANY_EXTEND, MVT::i16, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
if ((Idx % 2) != 0) if (Idx == 0)
Item = DAG.getNode(ISD::SHL, MVT::i16, // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
Item, DAG.getConstant(8, MVT::i8)); return getShuffleVectorZeroOrUndef(Item, VT, NumElems, Idx,
Idx /= 2; NumZero > 0, DAG);
ZeroV = getZeroVector(MVT::v8i16, DAG);
return DAG.getNode(ISD::BIT_CONVERT, VT, if (MVT::getSizeInBits(EVT) >= 32) {
DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, ZeroV, Item, // Turn it into a shuffle of zero and zero-extended scalar to vector.
DAG.getConstant(Idx, MVT::i32))); Item = getShuffleVectorZeroOrUndef(Item, VT, NumElems, 0, NumZero > 0,
} else { DAG);
ZeroV = getZeroVector(VT, DAG); MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, VT, ZeroV, Item, MVT::ValueType MaskEVT = MVT::getVectorBaseType(MaskVT);
DAG.getConstant(Idx, MVT::i32)); std::vector<SDOperand> MaskVec;
for (unsigned i = 0; i < NumElems; i++)
MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, MaskVec);
return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Item,
DAG.getNode(ISD::UNDEF, VT), Mask);
}
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (MVT::getSizeInBits(EVT) <= 16) {
if (NumNonZero <= Half) {
SDOperand V(0, 0);
for (unsigned i = 0; i < NumNonZero; ++i) {
unsigned Idx = CountTrailingZeros_32(NonZeros);
NonZeros ^= (1 << Idx);
SDOperand Item = Op.getOperand(Idx);
if (i == 0) {
if (NumZero)
V = getZeroVector(MVT::v8i16, DAG);
else
V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
}
if (EVT == MVT::i8) {
Item = DAG.getNode(ISD::ANY_EXTEND, MVT::i16, Item);
if ((Idx % 2) != 0)
Item = DAG.getNode(ISD::SHL, MVT::i16,
Item, DAG.getConstant(8, MVT::i8));
Idx /= 2;
}
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, Item,
DAG.getConstant(Idx, MVT::i32));
}
if (EVT == MVT::i8)
V = DAG.getNode(ISD::BIT_CONVERT, VT, V);
return V;
}
}
std::vector<SDOperand> V(NumElems);
if (NumElems == 4 && NumZero > 0) {
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1 << i));
if (isZero)
V[i] = getZeroVector(VT, DAG);
else
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
}
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
default: break;
case 0:
V[i] = V[i*2]; // Must be a zero vector.
break;
case 1:
V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2+1], V[i*2],
getMOVLMask(NumElems, DAG));
break;
case 2:
V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
getMOVLMask(NumElems, DAG));
break;
case 3:
V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
getUnpacklMask(NumElems, DAG));
break;
} }
} }
// Turn it into a shuffle of zero and zero-extended scalar to vector. // Take advantage of the fact R32 to VR128 scalar_to_vector (i.e. movd)
Item = getShuffleVectorAgainstZero(Item, VT, NumElems, 0, DAG); // clears the upper bits.
// FIXME: we can do the same for v4f32 case when we know both parts of
// the lower half come from scalar_to_vector (loadf32). We should do
// that in post legalizer dag combiner with target specific hooks.
if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
return V[0];
MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems); MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
MVT::ValueType MaskEVT = MVT::getVectorBaseType(MaskVT); MVT::ValueType EVT = MVT::getVectorBaseType(MaskVT);
std::vector<SDOperand> MaskVec; std::vector<SDOperand> MaskVec;
for (unsigned i = 0; i < NumElems; i++) bool Reverse = (NonZeros & 0x3) == 2;
MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT)); for (unsigned i = 0; i < 2; ++i)
SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, MaskVec); if (Reverse)
return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Item, MaskVec.push_back(DAG.getConstant(1-i, EVT));
DAG.getNode(ISD::UNDEF, VT), Mask); else
MaskVec.push_back(DAG.getConstant(i, EVT));
Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
for (unsigned i = 0; i < 2; ++i)
if (Reverse)
MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
else
MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
SDOperand ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, MaskVec);
return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[0], V[1], ShufMask);
} }
if (Values.size() > 2) { // Expand into a number of unpckl*.
// Expand into a number of unpckl*. // e.g. for v4f32
// e.g. for v4f32 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
// Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
// : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
// Step 2: unpcklps X, Y ==> <3, 2, 1, 0> SDOperand UnpckMask = getUnpacklMask(NumElems, DAG);
SDOperand PermMask = getUnpacklMask(NumElems, DAG); for (unsigned i = 0; i < NumElems; ++i)
std::vector<SDOperand> V(NumElems); V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
NumElems >>= 1;
while (NumElems != 0) {
for (unsigned i = 0; i < NumElems; ++i) for (unsigned i = 0; i < NumElems; ++i)
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i)); V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i], V[i + NumElems],
UnpckMask);
NumElems >>= 1; NumElems >>= 1;
while (NumElems != 0) {
for (unsigned i = 0; i < NumElems; ++i)
V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i], V[i + NumElems],
PermMask);
NumElems >>= 1;
}
return V[0];
} }
return V[0];
return SDOperand();
} }
case ISD::EXTRACT_VECTOR_ELT: { case ISD::EXTRACT_VECTOR_ELT: {
if (!isa<ConstantSDNode>(Op.getOperand(1))) if (!isa<ConstantSDNode>(Op.getOperand(1)))