Implement vector shift up / down and insert zero with ps{rl}lq / ps{rl}ldq.

llvm-svn: 51667
This commit is contained in:
Evan Cheng 2008-05-29 08:22:04 +00:00
parent f4aece5976
commit 5e28227dbd
10 changed files with 183 additions and 25 deletions

View File

@ -1853,10 +1853,17 @@ SDOperand SelectionDAG::getShuffleScalarElt(const SDNode *N, unsigned Idx) {
unsigned NumElems = PermMask.getNumOperands();
SDOperand V = (Idx < NumElems) ? N->getOperand(0) : N->getOperand(1);
Idx %= NumElems;
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) {
return (Idx == 0)
? V.getOperand(0) : getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
if (V.getOpcode() == ISD::BIT_CONVERT) {
V = V.getOperand(0);
if (MVT::getVectorNumElements(V.getValueType()) != NumElems)
return SDOperand();
}
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
return (Idx == 0) ? V.getOperand(0)
: getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
if (V.getOpcode() == ISD::BUILD_VECTOR)
return V.getOperand(Idx);
if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
SDOperand Elt = PermMask.getOperand(Idx);
if (Elt.getOpcode() == ISD::UNDEF)

View File

@ -2923,6 +2923,70 @@ static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, unsigned Idx,
return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
}
/// getNumOfConsecutiveZeros - Count the run of consecutive elements in the
/// result of the shuffle Op (with shuffle mask Mask) that are zero. The run
/// is measured from element 0 upwards when Low is true, and from the last
/// element downwards otherwise. Elements whose mask entry is undef are
/// counted as part of the run.
static
unsigned getNumOfConsecutiveZeros(SDOperand Op, SDOperand Mask,
                                  unsigned NumElems, bool Low,
                                  SelectionDAG &DAG) {
  unsigned Count = 0;
  while (Count != NumElems) {
    // Position in the mask of the next element to inspect.
    unsigned Pos = Low ? Count : NumElems - Count - 1;
    SDOperand MaskElt = Mask.getOperand(Pos);

    // An undef mask entry extends the run without further checks.
    if (MaskElt.getOpcode() != ISD::UNDEF) {
      unsigned SrcIdx = cast<ConstantSDNode>(MaskElt)->getValue();
      SDOperand Scalar = DAG.getShuffleScalarElt(Op.Val, SrcIdx);
      // Stop at the first element that is not a known zero (this includes
      // the case where no scalar could be found for it).
      if (!Scalar.Val || !isZeroNode(Scalar))
        break;
    }
    ++Count;
  }
  return Count;
}
/// isVectorShift - Determine whether the shuffle Op (with mask Mask) can be
/// implemented as a logical left or right shift of one of its input vectors.
///
/// On success, returns true and sets:
///   isLeft - true for a left shift (zeros in the low elements), false for a
///            right shift (zeros in the high elements);
///   ShVal  - the shuffle operand to be shifted;
///   ShAmt  - the shift amount, measured in vector elements.
/// On failure, returns false; ShVal and ShAmt are left untouched, but isLeft
/// may already have been overwritten.
static bool isVectorShift(SDOperand Op, SDOperand Mask, SelectionDAG &DAG,
                          bool &isLeft, SDOperand &ShVal, unsigned &ShAmt) {
  const unsigned NumElems = Mask.getNumOperands();

  // Zeros at the low end mean a left shift; failing that, zeros at the high
  // end mean a right shift. Without any zeros there is nothing to shift in.
  unsigned ZeroCount = getNumOfConsecutiveZeros(Op, Mask, NumElems, true, DAG);
  isLeft = (ZeroCount != 0);
  if (!isLeft) {
    ZeroCount = getNumOfConsecutiveZeros(Op, Mask, NumElems, false, DAG);
    if (ZeroCount == 0)
      return false;
  }

  // The remaining elements must be consecutive elements of a single source
  // vector, displaced by exactly ZeroCount positions.
  bool UsesV1 = false;
  bool UsesV2 = false;
  for (unsigned k = 0, e = NumElems - ZeroCount; k != e; ++k) {
    // Left shift: result element k+ZeroCount comes from source element k.
    // Right shift: result element k comes from source element k+ZeroCount.
    unsigned MaskPos  = isLeft ? (k + ZeroCount) : k;
    unsigned Expected = isLeft ? k : (k + ZeroCount);

    SDOperand MaskElt = Mask.getOperand(MaskPos);
    if (MaskElt.getOpcode() == ISD::UNDEF)
      continue;

    unsigned SrcIdx = cast<ConstantSDNode>(MaskElt)->getValue();
    if (SrcIdx >= NumElems) {
      // Indices at or above NumElems refer to the second shuffle operand.
      UsesV2 = true;
      SrcIdx -= NumElems;
    } else {
      UsesV1 = true;
    }

    if (SrcIdx != Expected)
      return false;
  }

  // A shift can only move elements of one vector.
  if (UsesV1 && UsesV2)
    return false;

  ShVal = Op.getOperand(UsesV1 ? 0 : 1);
  ShAmt = ZeroCount;
  return true;
}
/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDOperand LowerBuildVectorv16i8(SDOperand Op, unsigned NonZeros,
@ -2995,6 +3059,20 @@ static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,
return V;
}
/// getVShift - Build a whole-vector logical shift of SrcOp by NumBits bits.
/// The source is bit-converted to v1i64 (for 64-bit vector types) or v2i64
/// (otherwise), shifted with X86ISD::VSHL when isLeft is true or
/// X86ISD::VSRL when it is false, and the result is bit-converted back to VT.
static SDOperand getVShift(bool isLeft, MVT::ValueType VT, SDOperand SrcOp,
                           unsigned NumBits, SelectionDAG &DAG,
                           const TargetLowering &TLI) {
  // 64-bit vectors are the MMX case and shift as a single i64 element.
  MVT::ValueType ShiftVT = MVT::v2i64;
  if (MVT::getSizeInBits(VT) == 64)
    ShiftVT = MVT::v1i64;

  unsigned ShiftOpc = X86ISD::VSRL;
  if (isLeft)
    ShiftOpc = X86ISD::VSHL;

  SDOperand Casted  = DAG.getNode(ISD::BIT_CONVERT, ShiftVT, SrcOp);
  SDOperand Amount  = DAG.getConstant(NumBits, TLI.getShiftAmountTy());
  SDOperand Shifted = DAG.getNode(ShiftOpc, ShiftVT, Casted, Amount);
  return DAG.getNode(ISD::BIT_CONVERT, VT, Shifted);
}
SDOperand
X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
// All zero's are handled with pxor, all one's are handled with pcmpeqd.
@ -3091,6 +3169,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
Subtarget->hasSSE2(), DAG);
}
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
unsigned NumBits = MVT::getSizeInBits(VT);
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(1)),
NumBits/2, DAG, *this);
}
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
return SDOperand();
@ -3615,6 +3702,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
}
}
// Check if this can be converted into a logical shift.
bool isLeft = false;
unsigned ShAmt = 0;
SDOperand ShVal;
bool isShift = isVectorShift(Op, PermMask, DAG, isLeft, ShVal, ShAmt);
if (isShift && ShVal.hasOneUse()) {
// If the shifted value has multiple uses, it may be cheaper to use
// v_set0 + movlhps or movhlps, etc.
MVT::ValueType EVT = MVT::getVectorElementType(VT);
ShAmt *= MVT::getSizeInBits(EVT);
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this);
}
if (X86::isMOVLMask(PermMask.Val)) {
if (V1IsUndef)
return V2;
@ -3634,6 +3734,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
ShouldXformToMOVLP(V1.Val, V2.Val, PermMask.Val))
return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
if (isShift) {
// No better options. Use a vshl / vsrl.
MVT::ValueType EVT = MVT::getVectorElementType(VT);
ShAmt *= MVT::getSizeInBits(EVT);
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this);
}
bool Commuted = false;
// FIXME: This should also accept a bitcast of a splat? Be careful, not
// 1,1,1,1 -> v8i16 though.
@ -5729,6 +5836,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VSHL: return "X86ISD::VSHL";
case X86ISD::VSRL: return "X86ISD::VSRL";
}
}
@ -6296,8 +6405,10 @@ static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
static SDOperand PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget,
const TargetLowering &TLI) {
unsigned NumOps = N->getNumOperands();
// Ignore single operand BUILD_VECTOR.
if (N->getNumOperands() == 1)
if (NumOps == 1)
return SDOperand();
MVT::ValueType VT = N->getValueType(0);

View File

@ -205,7 +205,10 @@ namespace llvm {
VZEXT_MOVL,
// VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
VZEXT_LOAD
VZEXT_LOAD,
// VSHL, VSRL - Vector logical left / right shift.
VSHL, VSRL
};
}

View File

@ -294,6 +294,12 @@ defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
// Shift up / down and insert zeros.
// Select the whole-vector logical shift nodes (X86vshl / X86vshr) on a v1i64
// held in a VR64 register, with an immediate i8 shift amount, to the
// register-immediate MMX_PSLLQri / MMX_PSRLQri instructions. The immediate is
// forwarded to the instruction unchanged.
def : Pat<(v1i64 (X86vshl VR64:$src, (i8 imm:$amt))),
(v1i64 (MMX_PSLLQri VR64:$src, imm:$amt))>;
def : Pat<(v1i64 (X86vshr VR64:$src, (i8 imm:$amt))),
(v1i64 (MMX_PSRLQri VR64:$src, imm:$amt))>;
// Comparison Instructions
defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;

View File

@ -51,6 +51,8 @@ def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad]>;
// Whole-vector logical shift nodes: X86vshl matches X86ISD::VSHL (left) and
// X86vshr matches X86ISD::VSRL (right). Both use the SDTIntShiftOp profile.
def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>;
def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>;
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
@ -1957,6 +1959,12 @@ let Predicates = [HasSSE2] in {
(v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
(v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
// Shift up / down and insert zeros.
// Select the whole-vector logical shift nodes (X86vshl / X86vshr) on a v2i64
// held in a VR128 register to PSLLDQri / PSRLDQri. The immediate is passed
// through the PSxLDQ_imm transform rather than used directly.
// NOTE(review): PSxLDQ_imm presumably converts the bit count carried by the
// node into the byte count these instructions take -- confirm against its
// definition.
def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))),
(v2i64 (PSLLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))),
(v2i64 (PSRLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
}
// Logical

View File

@ -1,23 +1,7 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | grep movq | count 3
; FIXME: This code outputs:
;
; subl $28, %esp
; movl 32(%esp), %eax
; movd %eax, %mm0
; movq %mm0, (%esp)
; movl (%esp), %eax
; movl %eax, 20(%esp)
; movq %mm0, 8(%esp)
; movl 12(%esp), %eax
; movl %eax, 16(%esp)
; movq 16(%esp), %mm0
; addl $28, %esp
;
; Which is ugly. We need to fix this.
; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | not grep movq
; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | grep psllq
; qux: insert %A into element 1 of a <2 x i32> whose element 0 is zero.
; The RUN lines require that the generated code contains a psllq and no movq.
define <2 x i32> @qux(i32 %A) nounwind {
entry:
%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1 ; <<2 x i32>> [#uses=1]
ret <2 x i32> %tmp3
}

View File

@ -1,6 +1,7 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | not grep and
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | grep psrldq
define <4 x float> @test(<4 x float>* %v1) {
define <4 x float> @test(<4 x float>* %v1) nounwind {
%tmp = load <4 x float>* %v1 ; <<4 x float>> [#uses=1]
%tmp15 = bitcast <4 x float> %tmp to <2 x i64> ; <<2 x i64>> [#uses=1]
%tmp24 = and <2 x i64> %tmp15, bitcast (<4 x i32> < i32 0, i32 0, i32 -1, i32 -1 > to <2 x i64>) ; <<2 x i64>> [#uses=1]

View File

@ -1,6 +1,6 @@
; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep punpcklqdq | count 1
define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) {
define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
%tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1
ret <2 x i64> %tmp1
}

View File

@ -0,0 +1,31 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psllq | grep 32
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pslldq | grep 12
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psrldq | grep 8
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psrldq | grep 12
; t1: build a <2 x i32> with zero in element 0 and (%a << 12) in element 1,
; then store it as a <1 x i64>. Intended to be covered by the "psllq ... 32"
; RUN line: the only non-zero element sits in the upper half, i.e. the scalar
; shifted left by half the 64-bit vector width.
define void @t1(i32 %a, <1 x i64>* %P) nounwind {
%tmp12 = shl i32 %a, 12
%tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
%tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
%tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64>
store <1 x i64> %tmp23, <1 x i64>* %P
ret void
}
; t2: result is <0, 0, 0, %tmp1[0]> (mask index 4 selects an element of the
; zero vector). Three low zeros followed by source element 0 is a left shift
; by three 4-byte elements; intended to be covered by the "pslldq ... 12" RUN
; line.
define <4 x float> @t2(<4 x float>* %P) nounwind {
%tmp1 = load <4 x float>* %P
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
ret <4 x float> %tmp2
}
; t3: result is <%tmp1[2], %tmp1[3], 0, 0> (mask index 4 selects an element of
; the zero vector). Source elements 2 and 3 followed by two high zeros is a
; right shift by two 4-byte elements; intended to be covered by the
; "psrldq ... 8" RUN line.
define <4 x float> @t3(<4 x float>* %P) nounwind {
%tmp1 = load <4 x float>* %P
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
ret <4 x float> %tmp2
}
; t4: same idea with the operands swapped, so the zero vector is the first
; shuffle operand. Result is <%tmp1[3], 0, 0, 0> (mask index 7 selects element
; 3 of the second operand; index 0 selects a zero). That is a right shift by
; three 4-byte elements; intended to be covered by the "psrldq ... 12" RUN
; line.
define <4 x float> @t4(<4 x float>* %P) nounwind {
%tmp1 = load <4 x float>* %P
%tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
ret <4 x float> %tmp2
}

View File

@ -0,0 +1,7 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep pslldq
; Negative test: with the zero vector as the first operand, mask index 4
; selects %tmp1[0] and index 0 selects a zero, so the result is
; <%tmp1[0], %tmp1[0], %tmp1[0], 0>. That is a splat-like pattern, not a
; shifted vector, and the RUN line requires that no pslldq is emitted for it.
define <4 x float> @t3(<4 x float>* %P) nounwind {
%tmp1 = load <4 x float>* %P
%tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
ret <4 x float> %tmp2
}