Adapt the x86 build_vector dagcombine to the current state of the legalizer.

Build vectors with i64 elements only appear on 32-bit x86 before legalization.
Because vector widening happens during legalization and itself produces i64
build_vector elements, the DAG combiner never runs on these nodes before the
legalizer splits them into 32-bit elements.

Teach the build_vector DAG combine in the x86 back end to recognize consecutive
loads producing the low part of the vector.

Convert the two uses of TLI's consecutive load recognizer to pass LoadSDNodes,
since the recognizer already implicitly required its arguments to be loads.

Add a testcase for the transform.

Old:
	subl	$28, %esp
	movl	32(%esp), %eax
	movl	4(%eax), %ecx
	movl	%ecx, 4(%esp)
	movl	(%eax), %eax
	movl	%eax, (%esp)
	movaps	(%esp), %xmm0
	pmovzxwd	%xmm0, %xmm0
	movl	36(%esp), %eax
	movaps	%xmm0, (%eax)
	addl	$28, %esp
	ret

New:
	movl	4(%esp), %eax
	pmovzxwd	(%eax), %xmm0
	movl	8(%esp), %eax
	movaps	%xmm0, (%eax)
	ret

llvm-svn: 72957
This commit is contained in:
Nate Begeman 2009-06-05 21:37:30 +00:00
parent 3158790e32
commit 624690c6b2
5 changed files with 113 additions and 58 deletions

View File

@ -825,11 +825,11 @@ public:
virtual bool virtual bool
isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) const; isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) const;
/// isConsecutiveLoad - Return true if LD (which must be a LoadSDNode) is /// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a
/// loading 'Bytes' bytes from a location that is 'Dist' units away from the /// location that is 'Dist' units away from the location that the 'Base' load
/// location that the 'Base' load is loading from. /// is loading from.
bool isConsecutiveLoad(SDNode *LD, SDNode *Base, unsigned Bytes, int Dist, bool isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes,
const MachineFrameInfo *MFI) const; int Dist, const MachineFrameInfo *MFI) const;
/// PerformDAGCombine - This method will be invoked for all target nodes and /// PerformDAGCombine - This method will be invoked for all target nodes and
/// for any target-independent nodes that the target has registered with /// for any target-independent nodes that the target has registered with

View File

@ -3626,30 +3626,29 @@ static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, MVT VT) { SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, MVT VT) {
assert(N->getOpcode() == ISD::BUILD_PAIR); assert(N->getOpcode() == ISD::BUILD_PAIR);
SDNode *LD1 = getBuildPairElt(N, 0); LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
if (!ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse()) LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse())
return SDValue(); return SDValue();
MVT LD1VT = LD1->getValueType(0); MVT LD1VT = LD1->getValueType(0);
SDNode *LD2 = getBuildPairElt(N, 1);
const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
if (ISD::isNON_EXTLoad(LD2) && if (ISD::isNON_EXTLoad(LD2) &&
LD2->hasOneUse() && LD2->hasOneUse() &&
// If both are volatile this would reduce the number of volatile loads. // If both are volatile this would reduce the number of volatile loads.
// If one is volatile it might be ok, but play conservative and bail out. // If one is volatile it might be ok, but play conservative and bail out.
!cast<LoadSDNode>(LD1)->isVolatile() && !LD1->isVolatile() &&
!cast<LoadSDNode>(LD2)->isVolatile() && !LD2->isVolatile() &&
TLI.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1, MFI)) { TLI.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1, MFI)) {
LoadSDNode *LD = cast<LoadSDNode>(LD1); unsigned Align = LD1->getAlignment();
unsigned Align = LD->getAlignment();
unsigned NewAlign = TLI.getTargetData()-> unsigned NewAlign = TLI.getTargetData()->
getABITypeAlignment(VT.getTypeForMVT()); getABITypeAlignment(VT.getTypeForMVT());
if (NewAlign <= Align && if (NewAlign <= Align &&
(!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
return DAG.getLoad(VT, N->getDebugLoc(), LD->getChain(), LD->getBasePtr(), return DAG.getLoad(VT, N->getDebugLoc(), LD1->getChain(),
LD->getSrcValue(), LD->getSrcValueOffset(), LD1->getBasePtr(), LD1->getSrcValue(),
false, Align); LD1->getSrcValueOffset(), false, Align);
} }
return SDValue(); return SDValue();

View File

@ -2070,13 +2070,13 @@ bool TargetLowering::isGAPlusOffset(SDNode *N, GlobalValue* &GA,
} }
/// isConsecutiveLoad - Return true if LD (which must be a LoadSDNode) is /// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a
/// loading 'Bytes' bytes from a location that is 'Dist' units away from the /// location that is 'Dist' units away from the location that the 'Base' load
/// location that the 'Base' load is loading from. /// is loading from.
bool TargetLowering::isConsecutiveLoad(SDNode *LD, SDNode *Base, bool TargetLowering::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base,
unsigned Bytes, int Dist, unsigned Bytes, int Dist,
const MachineFrameInfo *MFI) const { const MachineFrameInfo *MFI) const {
if (LD->getOperand(0).getNode() != Base->getOperand(0).getNode()) if (LD->getChain() != Base->getChain())
return false; return false;
MVT VT = LD->getValueType(0); MVT VT = LD->getValueType(0);
if (VT.getSizeInBits() / 8 != Bytes) if (VT.getSizeInBits() / 8 != Bytes)
@ -2094,6 +2094,11 @@ bool TargetLowering::isConsecutiveLoad(SDNode *LD, SDNode *Base,
if (FS != BFS || FS != (int)Bytes) return false; if (FS != BFS || FS != (int)Bytes) return false;
return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
} }
if (Loc.getOpcode() == ISD::ADD && Loc.getOperand(0) == BaseLoc) {
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Loc.getOperand(1));
if (V && (V->getSExtValue() == Dist*Bytes))
return true;
}
GlobalValue *GV1 = NULL; GlobalValue *GV1 = NULL;
GlobalValue *GV2 = NULL; GlobalValue *GV2 = NULL;

View File

@ -7675,8 +7675,9 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
if (Elt.getOpcode() == ISD::UNDEF) if (Elt.getOpcode() == ISD::UNDEF)
continue; continue;
if (!TLI.isConsecutiveLoad(Elt.getNode(), Base, LoadSDNode *LD = cast<LoadSDNode>(Elt);
EVT.getSizeInBits()/8, i, MFI)) LoadSDNode *LDBase = cast<LoadSDNode>(Base);
if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI))
return false; return false;
} }
return true; return true;
@ -7751,11 +7752,15 @@ static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
MVT VT = N->getValueType(0); MVT VT = N->getValueType(0);
MVT EVT = VT.getVectorElementType(); MVT EVT = VT.getVectorElementType();
// Before or during type legalization, we want to try and convert a
// build_vector of an i64 load and a zero value into vzext_movl before the
// legalizer can break it up.
// FIXME: does the case below remove the need to do this?
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) {
if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit()) if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
// We are looking for load i64 and zero extend. We want to transform
// it before legalizer has a chance to expand it. Also look for i64
// BUILD_PAIR bit casted to f64.
return SDValue(); return SDValue();
// This must be an insertion into a zero vector. // This must be an insertion into a zero vector.
SDValue HighElt = N->getOperand(1); SDValue HighElt = N->getOperand(1);
if (!isZeroNode(HighElt)) if (!isZeroNode(HighElt))
@ -7789,6 +7794,40 @@ static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1)); TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
DCI.CommitTargetLoweringOpt(TLO); DCI.CommitTargetLoweringOpt(TLO);
return ResNode; return ResNode;
}
// The type legalizer will have broken apart v2i64 build_vector created during
// widening before the code which handles that case is run. Look for build
// vector (load, load + 4, 0/undef, 0/undef)
if (VT == MVT::v4i32 || VT == MVT::v4f32) {
LoadSDNode *LD0 = dyn_cast<LoadSDNode>(N->getOperand(0));
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(N->getOperand(1));
if (!LD0 || !LD1)
return SDValue();
if (LD0->getExtensionType() != ISD::NON_EXTLOAD ||
LD1->getExtensionType() != ISD::NON_EXTLOAD)
return SDValue();
// Make sure the second elt is a consecutive load.
if (!TLI.isConsecutiveLoad(LD1, LD0, EVT.getSizeInBits()/8, 1,
DAG.getMachineFunction().getFrameInfo()))
return SDValue();
SDValue N2 = N->getOperand(2);
SDValue N3 = N->getOperand(3);
if (!isZeroNode(N2) && N2.getOpcode() != ISD::UNDEF)
return SDValue();
if (!isZeroNode(N3) && N3.getOpcode() != ISD::UNDEF)
return SDValue();
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LD0->getChain(), LD0->getBasePtr() };
SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
TargetLowering::TargetLoweringOpt TLO(DAG);
TLO.CombineTo(SDValue(LD0, 1), ResNode.getValue(1));
DCI.CommitTargetLoweringOpt(TLO);
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
}
return SDValue();
} }
/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.

View File

@ -1,13 +1,25 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f ; RUN: llvm-as < %s | llc -march=x86 -mcpu=penryn -disable-mmx -o %t -f
; RUN: grep unpcklpd %t | count 1 ; RUN: grep unpcklpd %t | count 1
; RUN: grep movapd %t | count 1 ; RUN: grep movapd %t | count 1
; RUN: grep movaps %t | count 1
; Shows a dag combine bug that will generate an illegal build vector ; Shows a dag combine bug that will generate an illegal build vector
; with v2i64 build_vector i32, i32. ; with v2i64 build_vector i32, i32.
define void @test(<2 x double>* %dst, <4 x double> %src) { define void @test(<2 x double>* %dst, <4 x double> %src) nounwind {
entry: entry:
%tmp7.i = shufflevector <4 x double> %src, <4 x double> undef, <2 x i32> < i32 0, i32 2 > %tmp7.i = shufflevector <4 x double> %src, <4 x double> undef, <2 x i32> < i32 0, i32 2 >
store <2 x double> %tmp7.i, <2 x double>* %dst store <2 x double> %tmp7.i, <2 x double>* %dst
ret void ret void
} }
define void @test2(<4 x i16>* %src, <4 x i32>* %dest) nounwind {
entry:
%tmp1 = load <4 x i16>* %src
%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%0 = tail call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
store <4 x i32> %0, <4 x i32>* %dest
ret void
}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone