[SDAG] Re-instate r215611 with a fix to a pesky X86 DAG combine.

This combine is essentially combining target-specific nodes back into target
independent nodes that it "knows" will be combined yet again by a target
independent DAG combine into a different set of target-independent nodes that
are legal (not custom though!) and thus "ok". This seems... deeply flawed. The
crux of the problem is that we don't combine un-legalized shuffles that are
introduced by legalizing other operations, and thus we don't see a very
profitable combine opportunity. So the backend just forces the input to that
combine to re-appear.

However, for this to work, the conditions detected to re-form the unlegalized
nodes must be *exactly* right. Previously, failing this would have caused poor
code (if you're lucky) or a crasher when we failed to select instructions.
After r215611 we would fall back into the legalizer. In some cases, this just
"fixed" the crasher by producing bad code. But in the test case added it caused
the legalizer and the dag combiner to iterate forever.

The fix is to make the alignment checking in the x86 side of things match the
alignment checking in the generic DAG combine exactly. This isn't really a
satisfying or principled fix, but it at least makes the code work as intended.
It also highlights that it would be nice to detect the availability of under
aligned loads for a given type rather than bailing on this optimization. I've
left a FIXME to document this.

Original commit message for r215611 which covers the rest of the change:
  [SDAG] Fix a case where we would iteratively legalize a node during
  combining by replacing it with something else but not re-process the
  node afterward to remove it.

  In a truly remarkable stroke of bad luck, this would (in the test case
  attached) end up getting some other node combined into it without ever
  getting re-processed. By adding it back on to the worklist, in addition
  to deleting the dead nodes more quickly we also ensure that if it
  *stops* being dead for any reason it makes it back through the
  legalizer. Without this, the test case will end up failing during
  instruction selection due to an and node with a type we don't have an
  instruction pattern for.

It took many million runs of the shuffle fuzz tester to find this.

llvm-svn: 216537
This commit is contained in:
Chandler Carruth 2014-08-27 11:22:16 +00:00
parent 4d04f66627
commit 74ec9e19ee
4 changed files with 32 additions and 17 deletions

View File

@ -156,6 +156,8 @@ public:
// Node replacement helpers // Node replacement helpers
void ReplacedNode(SDNode *N) { void ReplacedNode(SDNode *N) {
LegalizedNodes.erase(N); LegalizedNodes.erase(N);
if (UpdatedNodes)
UpdatedNodes->insert(N);
} }
void ReplaceNode(SDNode *Old, SDNode *New) { void ReplaceNode(SDNode *Old, SDNode *New) {
DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG); DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);

View File

@ -19787,7 +19787,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EVT VT = InVec.getValueType(); EVT VT = InVec.getValueType();
bool HasShuffleIntoBitcast = false;
if (InVec.getOpcode() == ISD::BITCAST) { if (InVec.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses. // Don't duplicate a load with other uses.
if (!InVec.hasOneUse()) if (!InVec.hasOneUse())
@ -19796,7 +19795,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
return SDValue(); return SDValue();
InVec = InVec.getOperand(0); InVec = InVec.getOperand(0);
HasShuffleIntoBitcast = true;
} }
if (!isTargetShuffle(InVec.getOpcode())) if (!isTargetShuffle(InVec.getOpcode()))
@ -19839,17 +19837,16 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
return SDValue(); return SDValue();
if (HasShuffleIntoBitcast) { EVT EltVT = N->getValueType(0);
// If there's a bitcast before the shuffle, check if the load type and // If there's a bitcast before the shuffle, check if the load type and
// alignment is valid. // alignment is valid.
unsigned Align = LN0->getAlignment(); unsigned Align = LN0->getAlignment();
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NewAlign = TLI.getDataLayout()-> unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); EltVT.getTypeForEVT(*DAG.getContext()));
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
return SDValue(); return SDValue();
}
// All checks match so transform back to vector_shuffle so that DAG combiner // All checks match so transform back to vector_shuffle so that DAG combiner
// can finish the job // can finish the job

View File

@ -1,6 +1,8 @@
; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=yonah | FileCheck %s ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=yonah | FileCheck %s
; RUN: llc < %s -march=x86-64 -mattr=+sse2 -mcpu=core2 | FileCheck %s ; RUN: llc < %s -march=x86-64 -mattr=+sse2 -mcpu=core2 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define i32 @t(<2 x i64>* %val) nounwind { define i32 @t(<2 x i64>* %val) nounwind {
; CHECK-LABEL: t: ; CHECK-LABEL: t:
; CHECK-NOT: movd ; CHECK-NOT: movd
@ -23,3 +25,22 @@ undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
%y = extractelement <8 x i32> %Shuff68, i32 0 %y = extractelement <8 x i32> %Shuff68, i32 0
ret i32 %y ret i32 %y
} }
; This case could easily end up inf-looping in the DAG combiner due to a
; low-alignment load of the vector which prevents us from reliably forming a
; narrow load.
; FIXME: It would be nice to detect whether the target has fast and legal
; unaligned loads and use them here.
define void @t3() {
; CHECK-LABEL: t3:
;
; This movs the entire vector, shuffling the high double down. If we fixed the
; FIXME above it would just move the high double directly.
; CHECK: movhpd %xmm
bb:
%tmp13 = load <2 x double>* undef, align 1
%.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1
store double %.sroa.3.24.vec.extract, double* undef, align 8
unreachable
}

View File

@ -1,11 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSSE3
; XFAIL: *
; Only @stress_test0 is expected to fail, but XFAIL is not that selective. I
; expect this to be unxfailed soon enough that we won't regress the other tests
; in the interim.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown" target triple = "x86_64-unknown-unknown"